From d4acd722b7bb5f48b9fc3848e8c2a845b100d84f Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 31 Oct 2007 09:38:04 -0500
Subject: [SCSI] sysfs: add filter function to groups

This patch allows the various users of attribute_groups to selectively
allow the appearance of group attributes.  The primary consumer of
this will be the transport classes in which we currently have
elaborate attribute selection algorithms to do this same thing.

Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 7686417..dfef464 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -472,7 +472,7 @@ param_sysfs_setup(struct module_kobject *mk,
 			sizeof(mp->grp.attrs[0]));
 	size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
 
-	mp = kmalloc(size[0] + size[1], GFP_KERNEL);
+	mp = kzalloc(size[0] + size[1], GFP_KERNEL);
 	if (!mp)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.1


From fabe874a48de45b137f99b4ed3641e0413f465ce Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 24 Jan 2008 07:00:45 +0100
Subject: lockdep: fix kernel crash on module unload

Michael Wu noticed in his lkml post at

    http://marc.info/?l=linux-kernel&m=119396182726091&w=2

that certain wireless drivers ended up having their name in module
memory, which would then crash the kernel on module unload.

The patch he proposed was a bit clumsy in that it increased the size of
a lockdep entry significantly; the patch below tries another approach,
it checks, on module teardown, if the name of a class is in module space
and then zaps the class.  This is very similar to what we already do
with keys that are in module space.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/lockdep.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 4335f12..e2c07ece 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2932,7 +2932,7 @@ static void zap_class(struct lock_class *class)
 
 }
 
-static inline int within(void *addr, void *start, unsigned long size)
+static inline int within(const void *addr, void *start, unsigned long size)
 {
 	return addr >= start && addr < start + size;
 }
@@ -2955,9 +2955,12 @@ void lockdep_free_key_range(void *start, unsigned long size)
 		head = classhash_table + i;
 		if (list_empty(head))
 			continue;
-		list_for_each_entry_safe(class, next, head, hash_entry)
+		list_for_each_entry_safe(class, next, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
+			else if (within(class->name, start, size))
+				zap_class(class);
+		}
 	}
 
 	if (locked)
-- 
cgit v1.1


From 3514faca19a6fdc209734431c509631ea92b094e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 16 Oct 2007 10:11:44 -0600
Subject: kobject: remove struct kobj_type from struct kset

We don't need a "default" ktype for a kset.  We should set this
explicitly every time for each kset.  This change is needed so that we
can make ksets dynamic, and cleans up one of the odd, undocumented
assumption that the kset/kobject/ktype model has.

This patch is based on a lot of help from Kay Sievers.

Nasty bug in the block code was found by Dave Young
<hidave.darkstar@gmail.com>

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c     | 2 +-
 kernel/module.c     | 2 +-
 kernel/params.c     | 9 +++++----
 kernel/power/main.c | 2 +-
 4 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65daa53..094e2bc 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -94,7 +94,7 @@ static struct bin_attribute notes_attr = {
 	.read = &notes_read,
 };
 
-decl_subsys(kernel, NULL, NULL);
+decl_subsys(kernel, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
 
 static struct attribute * kernel_attrs[] = {
diff --git a/kernel/module.c b/kernel/module.c
index c2e3e2e..68df797 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1223,7 +1223,7 @@ int mod_sysfs_init(struct module *mod)
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
 		goto out;
-	kobj_set_kset_s(&mod->mkobj, module_subsys);
+	mod->mkobj.kobj.kset = &module_subsys;
 	mod->mkobj.mod = mod;
 
 	kobject_init(&mod->mkobj.kobj);
diff --git a/kernel/params.c b/kernel/params.c
index 7686417..9f05182 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -30,6 +30,8 @@
 #define DEBUGP(fmt, a...)
 #endif
 
+static struct kobj_type module_ktype;
+
 static inline char dash2underscore(char c)
 {
 	if (c == '-')
@@ -560,7 +562,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	BUG_ON(!mk);
 
 	mk->mod = THIS_MODULE;
-	kobj_set_kset_s(mk, module_subsys);
+	mk->kobj.kset = &module_subsys;
+	mk->kobj.ktype = &module_ktype;
 	kobject_set_name(&mk->kobj, name);
 	kobject_init(&mk->kobj);
 	ret = kobject_add(&mk->kobj);
@@ -679,8 +682,6 @@ static struct sysfs_ops module_sysfs_ops = {
 	.store = module_attr_store,
 };
 
-static struct kobj_type module_ktype;
-
 static int uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
@@ -694,7 +695,7 @@ static struct kset_uevent_ops module_uevent_ops = {
 	.filter = uevent_filter,
 };
 
-decl_subsys(module, &module_ktype, &module_uevent_ops);
+decl_subsys(module, &module_uevent_ops);
 int module_sysfs_initialized;
 
 static void module_release(struct kobject *kobj)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f71c950..1ef31c9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
 
 #endif /* CONFIG_SUSPEND */
 
-decl_subsys(power,NULL,NULL);
+decl_subsys(power, NULL);
 
 
 /**
-- 
cgit v1.1


From 4ff6abff832fbc6cb1d769f6106c841bc2b09f63 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 5 Nov 2007 22:24:43 -0800
Subject: kobject: get rid of kobject_add_dir

kobject_create_and_add is the same as kobject_add_dir, so drop
kobject_add_dir.


Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 68df797..5514277 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1122,7 +1122,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
 		++loaded;
 	}
 
-	notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes");
+	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
 	if (!notes_attrs->dir)
 		goto out;
 
@@ -1243,7 +1243,7 @@ int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out;
 
-	mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
+	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
 	if (!mod->holders_dir) {
 		err = -ENOMEM;
 		goto out_unreg;
@@ -2521,7 +2521,7 @@ static void module_create_drivers_dir(struct module_kobject *mk)
 	if (!mk || mk->drivers_dir)
 		return;
 
-	mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
+	mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj);
 }
 
 void module_add_driver(struct module *mod, struct device_driver *drv)
-- 
cgit v1.1


From bd35b93d8049ab47b5bfaf6b10ba39badf21d1c3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 29 Oct 2007 20:13:17 +0100
Subject: kset: convert kernel_subsys to use kset_create

Dynamically create the kset instead of declaring it statically.  We also
rename kernel_subsys to kernel_kset to catch all users of this symbol
with a build error instead of an easy-to-ignore build warning.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 42 ++++++++++++++++++++++++++++++------------
 kernel/user.c   |  4 ++--
 2 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 094e2bc..cf02d4b 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -94,8 +94,8 @@ static struct bin_attribute notes_attr = {
 	.read = &notes_read,
 };
 
-decl_subsys(kernel, NULL);
-EXPORT_SYMBOL_GPL(kernel_subsys);
+struct kset *kernel_kset;
+EXPORT_SYMBOL_GPL(kernel_kset);
 
 static struct attribute * kernel_attrs[] = {
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -116,24 +116,42 @@ static struct attribute_group kernel_attr_group = {
 
 static int __init ksysfs_init(void)
 {
-	int error = subsystem_register(&kernel_subsys);
-	if (!error)
-		error = sysfs_create_group(&kernel_subsys.kobj,
-					   &kernel_attr_group);
+	int error;
 
-	if (!error && notes_size > 0) {
+	kernel_kset = kset_create_and_add("kernel", NULL, NULL);
+	if (!kernel_kset) {
+		error = -ENOMEM;
+		goto exit;
+	}
+	error = sysfs_create_group(&kernel_kset->kobj, &kernel_attr_group);
+	if (error)
+		goto kset_exit;
+
+	if (notes_size > 0) {
 		notes_attr.size = notes_size;
-		error = sysfs_create_bin_file(&kernel_subsys.kobj,
-					      &notes_attr);
+		error = sysfs_create_bin_file(&kernel_kset->kobj, &notes_attr);
+		if (error)
+			goto group_exit;
 	}
 
 	/*
 	 * Create "/sys/kernel/uids" directory and corresponding root user's
 	 * directory under it.
 	 */
-	if (!error)
-		error = uids_kobject_init();
-
+	error = uids_kobject_init();
+	if (error)
+		goto notes_exit;
+
+	return 0;
+
+notes_exit:
+	if (notes_size > 0)
+		sysfs_remove_bin_file(&kernel_kset->kobj, &notes_attr);
+group_exit:
+	sysfs_remove_group(&kernel_kset->kobj, &kernel_attr_group);
+kset_exit:
+	kset_unregister(kernel_kset);
+exit:
 	return error;
 }
 
diff --git a/kernel/user.c b/kernel/user.c
index 8320a87..80f1116 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -198,8 +198,8 @@ int __init uids_kobject_init(void)
 	int error;
 
 	/* create under /sys/kernel dir */
-	uids_kobject.parent = &kernel_subsys.kobj;
-	uids_kobject.kset = &kernel_subsys;
+	uids_kobject.parent = &kernel_kset->kobj;
+	uids_kobject.kset = kernel_kset;
 	kobject_set_name(&uids_kobject, "uids");
 	kobject_init(&uids_kobject);
 
-- 
cgit v1.1


From 7405c1e15edfe43b137bfbc5882f1af34d6d414d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 1 Nov 2007 10:39:50 -0700
Subject: kset: convert /sys/module to use kset_create

Dynamically create the kset instead of declaring it statically.  We also
rename module_subsys to module_kset to catch all users of the variable.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c |  7 +++----
 kernel/params.c | 29 +++++++++--------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5514277..d03fcd9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,8 +47,6 @@
 #include <asm/cacheflush.h>
 #include <linux/license.h>
 
-extern int module_sysfs_initialized;
-
 #if 0
 #define DEBUGP printk
 #else
@@ -1223,7 +1221,8 @@ int mod_sysfs_init(struct module *mod)
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
 		goto out;
-	mod->mkobj.kobj.kset = &module_subsys;
+	mod->mkobj.kobj.kset = module_kset;
+	mod->mkobj.kobj.ktype = &module_ktype;
 	mod->mkobj.mod = mod;
 
 	kobject_init(&mod->mkobj.kobj);
@@ -2539,7 +2538,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
 		struct kobject *mkobj;
 
 		/* Lookup built-in module entry in /sys/modules */
-		mkobj = kset_find_obj(&module_subsys, drv->mod_name);
+		mkobj = kset_find_obj(module_kset, drv->mod_name);
 		if (mkobj) {
 			mk = container_of(mkobj, struct module_kobject, kobj);
 			/* remember our module structure */
diff --git a/kernel/params.c b/kernel/params.c
index 9f05182..97e0923 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -30,8 +30,6 @@
 #define DEBUGP(fmt, a...)
 #endif
 
-static struct kobj_type module_ktype;
-
 static inline char dash2underscore(char c)
 {
 	if (c == '-')
@@ -562,7 +560,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	BUG_ON(!mk);
 
 	mk->mod = THIS_MODULE;
-	mk->kobj.kset = &module_subsys;
+	mk->kobj.kset = module_kset;
 	mk->kobj.ktype = &module_ktype;
 	kobject_set_name(&mk->kobj, name);
 	kobject_init(&mk->kobj);
@@ -695,7 +693,7 @@ static struct kset_uevent_ops module_uevent_ops = {
 	.filter = uevent_filter,
 };
 
-decl_subsys(module, &module_uevent_ops);
+struct kset *module_kset;
 int module_sysfs_initialized;
 
 static void module_release(struct kobject *kobj)
@@ -707,7 +705,7 @@ static void module_release(struct kobject *kobj)
 	 */
 }
 
-static struct kobj_type module_ktype = {
+struct kobj_type module_ktype = {
 	.sysfs_ops =	&module_sysfs_ops,
 	.release =	module_release,
 };
@@ -717,13 +715,11 @@ static struct kobj_type module_ktype = {
  */
 static int __init param_sysfs_init(void)
 {
-	int ret;
-
-	ret = subsystem_register(&module_subsys);
-	if (ret < 0) {
-		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
-			__FILE__, __LINE__, ret);
-		return ret;
+	module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
+	if (!module_kset) {
+		printk(KERN_WARNING "%s (%d): error creating kset\n",
+			__FILE__, __LINE__);
+		return -ENOMEM;
 	}
 	module_sysfs_initialized = 1;
 
@@ -733,14 +729,7 @@ static int __init param_sysfs_init(void)
 }
 subsys_initcall(param_sysfs_init);
 
-#else
-#if 0
-static struct sysfs_ops module_sysfs_ops = {
-	.show = NULL,
-	.store = NULL,
-};
-#endif
-#endif
+#endif /* CONFIG_SYSFS */
 
 EXPORT_SYMBOL(param_set_byte);
 EXPORT_SYMBOL(param_get_byte);
-- 
cgit v1.1


From 039a5dcd2fc45188a2d522df630db4f7ef903a0f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 1 Nov 2007 10:39:50 -0700
Subject: kset: convert /sys/power to use kset_create

Dynamically create the kset instead of declaring it statically.  We also
rename power_subsys to power_kset to catch all users of the variable and
we properly export it so that people don't have to guess that it really
is present in the system.

The pseries code is wierd, why is it createing /sys/power if CONFIG_PM
is disabled?  Oh well, stupid big boxes ignoring config options...

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/disk.c  |  2 +-
 kernel/power/main.c  | 11 +++++------
 kernel/power/power.h |  2 --
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 05b6479..c3f0e61 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -708,7 +708,7 @@ static struct attribute_group attr_group = {
 
 static int __init pm_disk_init(void)
 {
-	return sysfs_create_group(&power_subsys.kobj, &attr_group);
+	return sysfs_create_group(&power_kset->kobj, &attr_group);
 }
 
 core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1ef31c9..dce2d76 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -276,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
 
 #endif /* CONFIG_SUSPEND */
 
-decl_subsys(power, NULL);
-
+struct kset *power_kset;
 
 /**
  *	state - control system power state.
@@ -386,10 +385,10 @@ static struct attribute_group attr_group = {
 
 static int __init pm_init(void)
 {
-	int error = subsystem_register(&power_subsys);
-	if (!error)
-		error = sysfs_create_group(&power_subsys.kobj,&attr_group);
-	return error;
+	power_kset = kset_create_and_add("power", NULL, NULL);
+	if (!power_kset)
+		return -ENOMEM;
+	return sysfs_create_group(&power_kset->kobj, &attr_group);
 }
 
 core_initcall(pm_init);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 195dc46..1083e6b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = {	\
 	.store	= _name##_store,		\
 }
 
-extern struct kset power_subsys;
-
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 extern int in_suspend;
-- 
cgit v1.1


From 386f275f5d097758f867bc99ddeaeb7a03b6b190 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 2 Nov 2007 13:47:53 +0100
Subject: Driver Core: switch all dynamic ksets to kobj_sysfs_ops

Switch all dynamically created ksets, that export simple attributes,
to kobj_attribute from subsys_attribute. Struct subsys_attribute will
be removed.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mike Halcrow <mhalcrow@us.ibm.com>
Cc: Phillip Hellewell <phillip@hellewell.homeip.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c      | 35 +++++++++++++++++++++--------------
 kernel/power/disk.c  | 18 ++++++++++++------
 kernel/power/main.c  | 12 ++++++++----
 kernel/power/power.h |  2 +-
 4 files changed, 42 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index cf02d4b..dd0f9e7 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -17,30 +17,34 @@
 #include <linux/sched.h>
 
 #define KERNEL_ATTR_RO(_name) \
-static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
 #define KERNEL_ATTR_RW(_name) \
-static struct subsys_attribute _name##_attr = \
+static struct kobj_attribute _name##_attr = \
 	__ATTR(_name, 0644, _name##_show, _name##_store)
 
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 /* current uevent sequence number */
-static ssize_t uevent_seqnum_show(struct kset *kset, char *page)
+static ssize_t uevent_seqnum_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
+	return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
 /* uevent helper program, used during early boo */
-static ssize_t uevent_helper_show(struct kset *kset, char *page)
+static ssize_t uevent_helper_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(page, "%s\n", uevent_helper);
+	return sprintf(buf, "%s\n", uevent_helper);
 }
-static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count)
+static ssize_t uevent_helper_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
 {
 	if (count+1 > UEVENT_HELPER_PATH_LEN)
 		return -ENOENT;
-	memcpy(uevent_helper, page, count);
+	memcpy(uevent_helper, buf, count);
 	uevent_helper[count] = '\0';
 	if (count && uevent_helper[count-1] == '\n')
 		uevent_helper[count-1] = '\0';
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper);
 #endif
 
 #ifdef CONFIG_KEXEC
-static ssize_t kexec_loaded_show(struct kset *kset, char *page)
+static ssize_t kexec_loaded_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(page, "%d\n", !!kexec_image);
+	return sprintf(buf, "%d\n", !!kexec_image);
 }
 KERNEL_ATTR_RO(kexec_loaded);
 
-static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
+static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(page, "%d\n", !!kexec_crash_image);
+	return sprintf(buf, "%d\n", !!kexec_crash_image);
 }
 KERNEL_ATTR_RO(kexec_crash_loaded);
 
-static ssize_t vmcoreinfo_show(struct kset *kset, char *page)
+static ssize_t vmcoreinfo_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(page, "%lx %x\n",
+	return sprintf(buf, "%lx %x\n",
 		       paddr_vmcoreinfo_note(),
 		       (unsigned int)vmcoreinfo_max_size);
 }
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c3f0e61..ef5aa2c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = {
  *	supports it (as determined by having hibernation_ops).
  */
 
-static ssize_t disk_show(struct kset *kset, char *buf)
+static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
 {
 	int i;
 	char *start = buf;
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf)
 }
 
 
-static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
+static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t n)
 {
 	int error = 0;
 	int i;
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
 
 power_attr(disk);
 
-static ssize_t resume_show(struct kset *kset, char *buf)
+static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
+			   char *buf)
 {
 	return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
 		       MINOR(swsusp_resume_device));
 }
 
-static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
+static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t n)
 {
 	unsigned int maj, min;
 	dev_t res;
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
 
 power_attr(resume);
 
-static ssize_t image_size_show(struct kset *kset, char *buf)
+static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
+			       char *buf)
 {
 	return sprintf(buf, "%lu\n", image_size);
 }
 
-static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n)
+static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
+				const char *buf, size_t n)
 {
 	unsigned long size;
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index dce2d76..b813949 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -289,7 +289,8 @@ struct kset *power_kset;
  *	proper enumerated value, and initiates a suspend transition.
  */
 
-static ssize_t state_show(struct kset *kset, char *buf)
+static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
+			  char *buf)
 {
 	char *s = buf;
 #ifdef CONFIG_SUSPEND
@@ -310,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf)
 	return (s - buf);
 }
 
-static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+			   const char *buf, size_t n)
 {
 #ifdef CONFIG_SUSPEND
 	suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -347,13 +349,15 @@ power_attr(state);
 #ifdef CONFIG_PM_TRACE
 int pm_trace_enabled;
 
-static ssize_t pm_trace_show(struct kset *kset, char *buf)
+static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
 {
 	return sprintf(buf, "%d\n", pm_trace_enabled);
 }
 
 static ssize_t
-pm_trace_store(struct kset *kset, const char *buf, size_t n)
+pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
+	       const char *buf, size_t n)
 {
 	int val;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1083e6b..2093c3a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long);
 extern struct mutex pm_mutex;
 
 #define power_attr(_name) \
-static struct subsys_attribute _name##_attr = {	\
+static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
 		.name = __stringify(_name),	\
 		.mode = 0644,			\
-- 
cgit v1.1


From eb41d9465cdafee45e0cb30f3b7338646221908e Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 2 Nov 2007 13:47:53 +0100
Subject: fix struct user_info export's sysfs interaction

Clean up the use of ksets and kobjects. Kobjects are instances of
objects (like struct user_info), ksets are collections of objects of a
similar type (like the uids directory containing the user_info directories).
So, use kobjects for the user_info directories, and a kset for the "uids"
directory.

On object cleanup, the final kobject_put() was missing.

Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c |   7 ++--
 kernel/user.c   | 104 +++++++++++++++++++++++++++-----------------------------
 2 files changed, 53 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index dd0f9e7..45e6465 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -141,11 +141,8 @@ static int __init ksysfs_init(void)
 			goto group_exit;
 	}
 
-	/*
-	 * Create "/sys/kernel/uids" directory and corresponding root user's
-	 * directory under it.
-	 */
-	error = uids_kobject_init();
+	/* create the /sys/kernel/uids/ directory */
+	error = uids_sysfs_init();
 	if (error)
 		goto notes_exit;
 
diff --git a/kernel/user.c b/kernel/user.c
index 80f1116..5a106f3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { }
 
 #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
 
-static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
 static DEFINE_MUTEX(uids_mutex);
 
 static inline void uids_mutex_lock(void)
@@ -128,86 +128,84 @@ static inline void uids_mutex_unlock(void)
 	mutex_unlock(&uids_mutex);
 }
 
-/* return cpu shares held by the user */
-static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+/* uid directory attributes */
+static ssize_t cpu_shares_show(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       char *buf)
 {
-	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
 
-	return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+	return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
 }
 
-/* modify cpu shares held by the user */
-static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
-				size_t size)
+static ssize_t cpu_shares_store(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				const char *buf, size_t size)
 {
-	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
 	unsigned long shares;
 	int rc;
 
-	sscanf(buffer, "%lu", &shares);
+	sscanf(buf, "%lu", &shares);
 
 	rc = sched_group_set_shares(up->tg, shares);
 
 	return (rc ? rc : size);
 }
 
-static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+static struct kobj_attribute cpu_share_attr =
+	__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+
+/* default attributes per uid directory */
+static struct attribute *uids_attributes[] = {
+	&cpu_share_attr.attr,
+	NULL
+};
+
+/* the lifetime of user_struct is not managed by the core (now) */
+static void uids_release(struct kobject *kobj)
 {
-	sa->attr.name = name;
-	sa->attr.mode = mode;
-	sa->show = cpu_shares_show;
-	sa->store = cpu_shares_store;
+	return;
 }
 
-/* Create "/sys/kernel/uids/<uid>" directory and
- *  "/sys/kernel/uids/<uid>/cpu_share" file for this user.
- */
-static int user_kobject_create(struct user_struct *up)
+static struct kobj_type uids_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = uids_attributes,
+	.release = uids_release,
+};
+
+/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
+static int uids_user_create(struct user_struct *up)
 {
-	struct kset *kset = &up->kset;
-	struct kobject *kobj = &kset->kobj;
+	struct kobject *kobj = &up->kobj;
 	int error;
 
-	memset(kset, 0, sizeof(struct kset));
-	kobj->parent = &uids_kobject;	/* create under /sys/kernel/uids dir */
-	kobject_set_name(kobj, "%d", up->uid);
-	kset_init(kset);
-	user_attr_init(&up->user_attr, "cpu_share", 0644);
-
+	memset(kobj, 0, sizeof(struct kobject));
+	kobj->ktype = &uids_ktype;
+	kobj->kset = uids_kset;
+	kobject_init(kobj);
+	kobject_set_name(&up->kobj, "%d", up->uid);
 	error = kobject_add(kobj);
 	if (error)
 		goto done;
 
-	error = sysfs_create_file(kobj, &up->user_attr.attr);
-	if (error)
-		kobject_del(kobj);
-
 	kobject_uevent(kobj, KOBJ_ADD);
-
 done:
 	return error;
 }
 
-/* create these in sysfs filesystem:
+/* create these entries in sysfs:
  * 	"/sys/kernel/uids" directory
  * 	"/sys/kernel/uids/0" directory (for root user)
  * 	"/sys/kernel/uids/0/cpu_share" file (for root user)
  */
-int __init uids_kobject_init(void)
+int __init uids_sysfs_init(void)
 {
-	int error;
+	uids_kset = kset_create_and_add("uids", NULL, &kernel_kset->kobj);
+	if (!uids_kset)
+		return -ENOMEM;
 
-	/* create under /sys/kernel dir */
-	uids_kobject.parent = &kernel_kset->kobj;
-	uids_kobject.kset = kernel_kset;
-	kobject_set_name(&uids_kobject, "uids");
-	kobject_init(&uids_kobject);
-
-	error = kobject_add(&uids_kobject);
-	if (!error)
-		error = user_kobject_create(&root_user);
-
-	return error;
+	return uids_user_create(&root_user);
 }
 
 /* work function to remove sysfs directory for a user and free up
@@ -216,7 +214,6 @@ int __init uids_kobject_init(void)
 static void remove_user_sysfs_dir(struct work_struct *w)
 {
 	struct user_struct *up = container_of(w, struct user_struct, work);
-	struct kobject *kobj = &up->kset.kobj;
 	unsigned long flags;
 	int remove_user = 0;
 
@@ -238,9 +235,9 @@ static void remove_user_sysfs_dir(struct work_struct *w)
 	if (!remove_user)
 		goto done;
 
-	sysfs_remove_file(kobj, &up->user_attr.attr);
-	kobject_uevent(kobj, KOBJ_REMOVE);
-	kobject_del(kobj);
+	kobject_uevent(&up->kobj, KOBJ_REMOVE);
+	kobject_del(&up->kobj);
+	kobject_put(&up->kobj);
 
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
@@ -267,7 +264,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 
 #else	/* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
 
-static inline int user_kobject_create(struct user_struct *up) { return 0; }
+int uids_sysfs_init(void) { return 0; }
+static inline int uids_user_create(struct user_struct *up) { return 0; }
 static inline void uids_mutex_lock(void) { }
 static inline void uids_mutex_unlock(void) { }
 
@@ -324,7 +322,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
-	/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	 * atomic.
 	 */
 	uids_mutex_lock();
@@ -370,7 +368,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 			return NULL;
 		}
 
-		if (user_kobject_create(new)) {
+		if (uids_user_create(new)) {
 			sched_destroy_user(new);
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
-- 
cgit v1.1


From 0ff21e46630abce11fdaaffabd72bbd4eed5ac2c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 6 Nov 2007 10:36:58 -0800
Subject: kobject: convert kernel_kset to be a kobject

kernel_kset does not need to be a kset, but a much simpler kobject now
that we have kobj_attributes.

We also rename kernel_kset to kernel_kobj to catch all users of this
symbol with a build error instead of an easy-to-ignore build warning.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 18 +++++++++---------
 kernel/user.c   |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 45e6465..1081aff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,8 +101,8 @@ static struct bin_attribute notes_attr = {
 	.read = &notes_read,
 };
 
-struct kset *kernel_kset;
-EXPORT_SYMBOL_GPL(kernel_kset);
+struct kobject *kernel_kobj;
+EXPORT_SYMBOL_GPL(kernel_kobj);
 
 static struct attribute * kernel_attrs[] = {
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -125,18 +125,18 @@ static int __init ksysfs_init(void)
 {
 	int error;
 
-	kernel_kset = kset_create_and_add("kernel", NULL, NULL);
-	if (!kernel_kset) {
+	kernel_kobj = kobject_create_and_add("kernel", NULL);
+	if (!kernel_kobj) {
 		error = -ENOMEM;
 		goto exit;
 	}
-	error = sysfs_create_group(&kernel_kset->kobj, &kernel_attr_group);
+	error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
 	if (error)
 		goto kset_exit;
 
 	if (notes_size > 0) {
 		notes_attr.size = notes_size;
-		error = sysfs_create_bin_file(&kernel_kset->kobj, &notes_attr);
+		error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
 		if (error)
 			goto group_exit;
 	}
@@ -150,11 +150,11 @@ static int __init ksysfs_init(void)
 
 notes_exit:
 	if (notes_size > 0)
-		sysfs_remove_bin_file(&kernel_kset->kobj, &notes_attr);
+		sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
-	sysfs_remove_group(&kernel_kset->kobj, &kernel_attr_group);
+	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
-	kset_unregister(kernel_kset);
+	kobject_unregister(kernel_kobj);
 exit:
 	return error;
 }
diff --git a/kernel/user.c b/kernel/user.c
index 5a106f3..7f17e6e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -201,7 +201,7 @@ done:
  */
 int __init uids_sysfs_init(void)
 {
-	uids_kset = kset_create_and_add("uids", NULL, &kernel_kset->kobj);
+	uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
 	if (!uids_kset)
 		return -ENOMEM;
 
-- 
cgit v1.1


From d76e15fb20eeb7632ef38876a884fe3508b2c01d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 27 Nov 2007 11:28:26 -0800
Subject: driver core: make /sys/power a kobject

/sys/power should not be a kset, that's overkill.  This patch renames it
to power_kset and fixes up all usages of it in the tree.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/disk.c | 2 +-
 kernel/power/main.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index ef5aa2c..b138b43 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -714,7 +714,7 @@ static struct attribute_group attr_group = {
 
 static int __init pm_disk_init(void)
 {
-	return sysfs_create_group(&power_kset->kobj, &attr_group);
+	return sysfs_create_group(power_kobj, &attr_group);
 }
 
 core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b813949..efc0836 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
 
 #endif /* CONFIG_SUSPEND */
 
-struct kset *power_kset;
+struct kobject *power_kobj;
 
 /**
  *	state - control system power state.
@@ -389,10 +389,10 @@ static struct attribute_group attr_group = {
 
 static int __init pm_init(void)
 {
-	power_kset = kset_create_and_add("power", NULL, NULL);
-	if (!power_kset)
+	power_kobj = kobject_create_and_add("power", NULL);
+	if (!power_kobj)
 		return -ENOMEM;
-	return sysfs_create_group(&power_kset->kobj, &attr_group);
+	return sysfs_create_group(power_kobj, &attr_group);
 }
 
 core_initcall(pm_init);
-- 
cgit v1.1


From e43b9192c59402685bd1f809068dd13aa5931570 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 23:05:35 -0700
Subject: Kobject: convert kernel/params.c to use kobject_init/add_ng()

This converts the code to use the new kobject functions, cleaning up the
logic in doing so.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/params.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 97e0923..1078b14 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -561,11 +561,9 @@ static void __init kernel_param_sysfs_setup(const char *name,
 
 	mk->mod = THIS_MODULE;
 	mk->kobj.kset = module_kset;
-	mk->kobj.ktype = &module_ktype;
-	kobject_set_name(&mk->kobj, name);
-	kobject_init(&mk->kobj);
-	ret = kobject_add(&mk->kobj);
+	ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
 	if (ret) {
+		kobject_put(&mk->kobj);
 		printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
 		      "error number %d\n", name, ret);
 		printk(KERN_ERR	"The system will be unstable now.\n");
-- 
cgit v1.1


From cf15126b3d4511e06e5299781ab74922590900be Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 23:05:35 -0700
Subject: Kobject: convert kernel/user.c to use kobject_init/add_ng()

This converts the code to use the new kobject functions, cleaning up the
logic in doing so.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/user.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 7f17e6e..ab4fd70 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -181,13 +181,12 @@ static int uids_user_create(struct user_struct *up)
 	int error;
 
 	memset(kobj, 0, sizeof(struct kobject));
-	kobj->ktype = &uids_ktype;
 	kobj->kset = uids_kset;
-	kobject_init(kobj);
-	kobject_set_name(&up->kobj, "%d", up->uid);
-	error = kobject_add(kobj);
-	if (error)
+	error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
+	if (error) {
+		kobject_put(kobj);
 		goto done;
+	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 done:
-- 
cgit v1.1


From c63469a3985a9771c18a916b8d42845d044ea0b1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 28 Nov 2007 12:23:18 -0800
Subject: Driver core: move the driver specific module code into the driver
 core

The module driver specific code should belong in the driver core, not in
the kernel/ directory.  So move this code.  This is done in preparation
for some struct device_driver rework that should be confined to the
driver core code only.

This also lets us keep from exporting these functions, as no external
code should ever be calling it.

Thanks to Andrew Morton for the !CONFIG_MODULES fix.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 87 ---------------------------------------------------------
 1 file changed, 87 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d03fcd9..dc4d3f5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2501,93 +2501,6 @@ void print_modules(void)
 	printk("\n");
 }
 
-#ifdef CONFIG_SYSFS
-static char *make_driver_name(struct device_driver *drv)
-{
-	char *driver_name;
-
-	driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
-			      GFP_KERNEL);
-	if (!driver_name)
-		return NULL;
-
-	sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
-	return driver_name;
-}
-
-static void module_create_drivers_dir(struct module_kobject *mk)
-{
-	if (!mk || mk->drivers_dir)
-		return;
-
-	mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj);
-}
-
-void module_add_driver(struct module *mod, struct device_driver *drv)
-{
-	char *driver_name;
-	int no_warn;
-	struct module_kobject *mk = NULL;
-
-	if (!drv)
-		return;
-
-	if (mod)
-		mk = &mod->mkobj;
-	else if (drv->mod_name) {
-		struct kobject *mkobj;
-
-		/* Lookup built-in module entry in /sys/modules */
-		mkobj = kset_find_obj(module_kset, drv->mod_name);
-		if (mkobj) {
-			mk = container_of(mkobj, struct module_kobject, kobj);
-			/* remember our module structure */
-			drv->mkobj = mk;
-			/* kset_find_obj took a reference */
-			kobject_put(mkobj);
-		}
-	}
-
-	if (!mk)
-		return;
-
-	/* Don't check return codes; these calls are idempotent */
-	no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
-	driver_name = make_driver_name(drv);
-	if (driver_name) {
-		module_create_drivers_dir(mk);
-		no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
-					    driver_name);
-		kfree(driver_name);
-	}
-}
-EXPORT_SYMBOL(module_add_driver);
-
-void module_remove_driver(struct device_driver *drv)
-{
-	struct module_kobject *mk = NULL;
-	char *driver_name;
-
-	if (!drv)
-		return;
-
-	sysfs_remove_link(&drv->kobj, "module");
-
-	if (drv->owner)
-		mk = &drv->owner->mkobj;
-	else if (drv->mkobj)
-		mk = drv->mkobj;
-	if (mk && mk->drivers_dir) {
-		driver_name = make_driver_name(drv);
-		if (driver_name) {
-			sysfs_remove_link(mk->drivers_dir, driver_name);
-			kfree(driver_name);
-		}
-	}
-}
-EXPORT_SYMBOL(module_remove_driver);
-#endif
-
 #ifdef CONFIG_MODVERSIONS
 /* Generate the signature for struct module here, too, for modversions. */
 void struct_module(struct module *mod) { return; }
-- 
cgit v1.1


From 97c146ef075dc40ae34407c03d3860fc3850b8e8 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 29 Nov 2007 23:46:11 +0100
Subject: sysfs: fix /sys/module/*/holders after sysfs logic change

Sysfs symlinks now require fully registered kobjects as a target,
otherwise the call to create a symlink will fail. Here we register
the kobject before we request the symlink in the holders directory.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index dc4d3f5..0ae8117 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1227,6 +1227,8 @@ int mod_sysfs_init(struct module *mod)
 
 	kobject_init(&mod->mkobj.kobj);
 
+	/* delay uevent until full sysfs population */
+	err = kobject_add(&mod->mkobj.kobj);
 out:
 	return err;
 }
@@ -1237,11 +1239,6 @@ int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
-	/* delay uevent until full sysfs population */
-	err = kobject_add(&mod->mkobj.kobj);
-	if (err)
-		goto out;
-
 	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
 	if (!mod->holders_dir) {
 		err = -ENOMEM;
@@ -1266,7 +1263,6 @@ out_unreg_holders:
 out_unreg:
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
-out:
 	return err;
 }
 #endif
@@ -1883,10 +1879,10 @@ static struct module *load_module(void __user *umod,
 	/* Now we've moved module, initialize linked lists, etc. */
 	module_unload_init(mod);
 
-	/* Initialize kobject, so we can reference it. */
+	/* add kobject, so we can reference it. */
 	err = mod_sysfs_init(mod);
 	if (err)
-		goto cleanup;
+		goto free_unload;
 
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
@@ -2056,6 +2052,9 @@ static struct module *load_module(void __user *umod,
  arch_cleanup:
 	module_arch_cleanup(mod);
  cleanup:
+	kobject_del(&mod->mkobj.kobj);
+	kobject_put(&mod->mkobj.kobj);
+ free_unload:
 	module_unload_free(mod);
 	module_free(mod, mod->module_init);
  free_core:
-- 
cgit v1.1


From ac3c8141f62f357169980ec21b7be6d29964a394 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 17 Dec 2007 23:05:35 -0700
Subject: Kobject: convert kernel/module.c to use kobject_init/add_ng()

This converts the code to use the new kobject functions, cleaning up the
logic in doing so.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 0ae8117..89cd4c7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1217,18 +1217,16 @@ int mod_sysfs_init(struct module *mod)
 		err = -EINVAL;
 		goto out;
 	}
-	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
-	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
-	if (err)
-		goto out;
-	mod->mkobj.kobj.kset = module_kset;
-	mod->mkobj.kobj.ktype = &module_ktype;
 	mod->mkobj.mod = mod;
 
-	kobject_init(&mod->mkobj.kobj);
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	mod->mkobj.kobj.kset = module_kset;
+	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+				   "%s", mod->name);
+	if (err)
+		kobject_put(&mod->mkobj.kobj);
 
 	/* delay uevent until full sysfs population */
-	err = kobject_add(&mod->mkobj.kobj);
 out:
 	return err;
 }
-- 
cgit v1.1


From 7a6a41615bfb2f03ce797bc24104c50b42c935e5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Sat, 22 Dec 2007 21:18:25 -0800
Subject: Modules: remove unneeded release function

Now that kobjects properly clean up their name structures, no matter if
they have a release function or not, we can drop this empty module
kobject release function too (it was needed prior to this because of the
way we handled static kobject names, we based the fact that if a release
function was present, then we could safely free the name string, now we
are more smart about things and only free names we have previously set.)

Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/params.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 1078b14..b4da950 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -694,18 +694,8 @@ static struct kset_uevent_ops module_uevent_ops = {
 struct kset *module_kset;
 int module_sysfs_initialized;
 
-static void module_release(struct kobject *kobj)
-{
-	/*
-	 * Stupid empty release function to allow the memory for the kobject to
-	 * be properly cleaned up.  This will not need to be present for 2.6.25
-	 * with the upcoming kobject core rework.
-	 */
-}
-
 struct kobj_type module_ktype = {
 	.sysfs_ops =	&module_sysfs_ops,
-	.release =	module_release,
 };
 
 /*
-- 
cgit v1.1


From 78a2d906b40fe530ea800c1e873bfe8f02326f1e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 20 Dec 2007 08:13:05 -0800
Subject: Kobject: convert remaining kobject_unregister() to kobject_put()

There is no need for kobject_unregister() anymore, thanks to Kay's
kobject cleanup changes, so replace all instances of it with
kobject_put().


Cc: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c | 2 +-
 kernel/module.c | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1081aff..e53bc30 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -154,7 +154,7 @@ notes_exit:
 group_exit:
 	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
-	kobject_unregister(kernel_kobj);
+	kobject_put(kernel_kobj);
 exit:
 	return error;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 89cd4c7..dcb8a2c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1257,9 +1257,8 @@ int mod_sysfs_setup(struct module *mod,
 out_unreg_param:
 	module_param_sysfs_remove(mod);
 out_unreg_holders:
-	kobject_unregister(mod->holders_dir);
+	kobject_put(mod->holders_dir);
 out_unreg:
-	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
 	return err;
 }
@@ -1269,9 +1268,9 @@ static void mod_kobject_remove(struct module *mod)
 {
 	module_remove_modinfo_attrs(mod);
 	module_param_sysfs_remove(mod);
-	kobject_unregister(mod->mkobj.drivers_dir);
-	kobject_unregister(mod->holders_dir);
-	kobject_unregister(&mod->mkobj.kobj);
+	kobject_put(mod->mkobj.drivers_dir);
+	kobject_put(mod->holders_dir);
+	kobject_put(&mod->mkobj.kobj);
 }
 
 /*
-- 
cgit v1.1


From af5ca3f4ec5cc4432a42a73b050dd8898ce8fd00 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 20 Dec 2007 02:09:39 +0100
Subject: Driver core: change sysdev classes to use dynamic kobject names

All kobjects require a dynamically allocated name now. We no longer
need to keep track if the name is statically assigned, we can just
unconditionally free() all kobject names on cleanup.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/rtmutex-tester.c   | 2 +-
 kernel/time/clocksource.c | 2 +-
 kernel/time/timekeeping.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index e3055ba..092e4c6 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
 static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
 
 static struct sysdev_class rttest_sysclass = {
-	set_kset_name("rttest"),
+	.name = "rttest",
 };
 
 static int init_test_thread(int id)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c8a9d13..8d6125a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -441,7 +441,7 @@ static SYSDEV_ATTR(available_clocksource, 0600,
 		   sysfs_show_available_clocksources, NULL);
 
 static struct sysdev_class clocksource_sysclass = {
-	set_kset_name("clocksource"),
+	.name = "clocksource",
 };
 
 static struct sys_device device_clocksource = {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e5e466b..ab46ae8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -335,9 +335,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
 
 /* sysfs resume/suspend bits for timekeeping */
 static struct sysdev_class timekeeping_sysclass = {
+	.name		= "timekeeping",
 	.resume		= timekeeping_resume,
 	.suspend	= timekeeping_suspend,
-	set_kset_name("timekeeping"),
 };
 
 static struct sys_device device_timer = {
-- 
cgit v1.1


From 6ea6dd93c9454cc9521134f907bc970d09f460e4 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Tue, 27 Nov 2007 13:02:40 +0100
Subject: ptrace: Call arch_ptrace_attach() when request=PTRACE_TRACEME

arch_ptrace_attach() is a hook that allows the architecture to do
book-keeping after a ptrace attach. This patch adds a call to this
hook when handling a PTRACE_TRACEME request as well.

Currently only one architecture, m32r, implements this hook. When
called, it initializes a number of debug trap slots in the ptraced
task's thread struct, and it looks to me like this is the right thing
to do after a PTRACE_TRACEME request as well, not only after
PTRACE_ATTACH. Please correct me if I'm wrong.

I want to use this hook on AVR32 to turn the debugging hardware on
when a process is actually being debugged and keep it off otherwise.
To be able to do this, I need to intercept PTRACE_TRACEME and
PTRACE_ATTACH, as well as PTRACE_DETACH and thread exit. The latter
two can be handled by existing hooks.

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
---
 kernel/ptrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c25db86..c719bb9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -470,6 +470,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 	lock_kernel();
 	if (request == PTRACE_TRACEME) {
 		ret = ptrace_traceme();
+		if (!ret)
+			arch_ptrace_attach(current);
 		goto out;
 	}
 
-- 
cgit v1.1


From 32a76006683f7b28ae3cc491da37716e002f198e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:07:58 +0100
Subject: printk: make printk more robust by not allowing recursion

make printk more robust by allowing recursion only if there's a crash
going on. Also add recursion detection.

I've tested it with an artificially injected printk recursion - instead
of a lockup or spontaneous reboot or other crash, the output was a well
controlled:

[   41.057335] SysRq : <2>BUG: recent printk recursion!
[   41.057335] loglevel0-8 reBoot Crashdump show-all-locks(D) tErm Full kIll saK showMem Nice powerOff showPc show-all-timers(Q) unRaw Sync showTasks Unmount shoW-blocked-tasks

also do all this printk-debug logic with irqs disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reviewed-by: Nick Piggin <npiggin@suse.de>
---
 kernel/printk.c | 48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 89011bf..e6c1f36 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -628,30 +628,57 @@ asmlinkage int printk(const char *fmt, ...)
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;
 
+const char printk_recursion_bug_msg [] =
+			KERN_CRIT "BUG: recent printk recursion!\n";
+static int printk_recursion_bug;
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
+	static int log_level_unknown = 1;
+	static char printk_buf[1024];
+
 	unsigned long flags;
-	int printed_len;
+	int printed_len = 0;
+	int this_cpu;
 	char *p;
-	static char printk_buf[1024];
-	static int log_level_unknown = 1;
 
 	boot_delay_msec();
 
 	preempt_disable();
-	if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
-		/* If a crash is occurring during printk() on this CPU,
-		 * make sure we can't deadlock */
-		zap_locks();
-
 	/* This stops the holder of console_sem just where we want him */
 	raw_local_irq_save(flags);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * Ouch, printk recursed into itself!
+	 */
+	if (unlikely(printk_cpu == this_cpu)) {
+		/*
+		 * If a crash is occurring during printk() on this CPU,
+		 * then try to get the crash message out but make sure
+		 * we can't deadlock. Otherwise just return to avoid the
+		 * recursion and return - but flag the recursion so that
+		 * it can be printed at the next appropriate moment:
+		 */
+		if (!oops_in_progress) {
+			printk_recursion_bug = 1;
+			goto out_restore_irqs;
+		}
+		zap_locks();
+	}
+
 	lockdep_off();
 	spin_lock(&logbuf_lock);
-	printk_cpu = smp_processor_id();
+	printk_cpu = this_cpu;
 
+	if (printk_recursion_bug) {
+		printk_recursion_bug = 0;
+		strcpy(printk_buf, printk_recursion_bug_msg);
+		printed_len = sizeof(printk_recursion_bug_msg);
+	}
 	/* Emit the output into the temporary buffer */
-	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+	printed_len += vscnprintf(printk_buf + printed_len,
+				  sizeof(printk_buf), fmt, args);
 
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
@@ -744,6 +771,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		printk_cpu = UINT_MAX;
 		spin_unlock(&logbuf_lock);
 		lockdep_on();
+out_restore_irqs:
 		raw_local_irq_restore(flags);
 	}
 
-- 
cgit v1.1


From d713f519332e029d43eca8462629314eee1ded86 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:07:58 +0100
Subject: sched: fix CONFIG_PRINT_TIME's reliance on sched_clock()

Stefano Brivio reported weird printk timestamp behavior during
CPU frequency changes:

  http://bugzilla.kernel.org/show_bug.cgi?id=9475

fix CONFIG_PRINT_TIME's reliance on sched_clock() and use cpu_clock()
instead.

Reported-and-bisected-by: Stefano Brivio <stefano.brivio@polimi.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e6c1f36..5f9d053 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -707,7 +707,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 					loglev_char = default_message_loglevel
 						+ '0';
 				}
-				t = printk_clock();
+				t = cpu_clock(printk_cpu);
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
-- 
cgit v1.1


From b842271fbb9c8b5fd0e1c3e1895a3b67ba5bcc54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:07:59 +0100
Subject: sched: remove printk_clock()

printk_clock() is obsolete - it has been replaced with cpu_clock().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 5f9d053..3b7c968 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str)
 
 __setup("time", printk_time_setup);
 
-__attribute__((weak)) unsigned long long printk_clock(void)
-{
-	return sched_clock();
-}
-
 /* Check if we have any console registered that can be called early in boot. */
 static int have_callable_console(void)
 {
-- 
cgit v1.1


From 93f992ccc008dd4030381caeebb252e85e66684b Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:07:59 +0100
Subject: sched: group scheduling code cleanup

Minor cleanups:

- Fix coding style
- remove obsolete comment

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11c..7f827b7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -191,12 +191,12 @@ struct task_group init_task_group = {
 };
 
 #ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GRP_LOAD	2*NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD	2*NICE_0_LOAD
 #else
-# define INIT_TASK_GRP_LOAD	NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
-static int init_task_group_load = INIT_TASK_GRP_LOAD;
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
 static inline struct task_group *task_group(struct task_struct *p)
@@ -881,21 +881,6 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 
 #define sched_class_highest (&rt_sched_class)
 
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->load
- * and when switching tasks.
- */
 static inline void inc_load(struct rq *rq, const struct task_struct *p)
 {
 	update_load_add(&rq->load, p->se.load.weight);
-- 
cgit v1.1


From ec2c507fe8c8fa3c04fc6cb99a382a965c477379 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:07:59 +0100
Subject: sched: group scheduling, minor fixes

Minor bug fixes for the group scheduler:

- Use a mutex to serialize add/remove of task groups and also when
  changing shares of a task group. Use the same mutex when printing
  cfs_rq debugging stats for various task groups.

- Use list_for_each_entry_rcu in for_each_leaf_cfs_rq macro (when
  walking task group list)

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 33 +++++++++++++++++++++++++--------
 kernel/sched_fair.c |  4 +++-
 2 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7f827b7..cfa6958 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -169,8 +169,6 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
-	/* spinlock to serialize modification to shares */
-	spinlock_t lock;
 	struct rcu_head rcu;
 };
 
@@ -182,6 +180,11 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 
+/* task_group_mutex serializes add/remove of task groups and also changes to
+ * a task group's cpu shares.
+ */
+static DEFINE_MUTEX(task_group_mutex);
+
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
@@ -221,9 +224,21 @@ static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
 	p->se.parent = task_group(p)->se[cpu];
 }
 
+static inline void lock_task_group_list(void)
+{
+	mutex_lock(&task_group_mutex);
+}
+
+static inline void unlock_task_group_list(void)
+{
+	mutex_unlock(&task_group_mutex);
+}
+
 #else
 
 static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
+static inline void lock_task_group_list(void) { }
+static inline void unlock_task_group_list(void) { }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
@@ -6768,7 +6783,6 @@ void __init sched_init(void)
 			se->parent = NULL;
 		}
 		init_task_group.shares = init_task_group_load;
-		spin_lock_init(&init_task_group.lock);
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7008,14 +7022,15 @@ struct task_group *sched_create_group(void)
 		se->parent = NULL;
 	}
 
+	tg->shares = NICE_0_LOAD;
+
+	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		cfs_rq = tg->cfs_rq[i];
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 	}
-
-	tg->shares = NICE_0_LOAD;
-	spin_lock_init(&tg->lock);
+	unlock_task_group_list();
 
 	return tg;
 
@@ -7061,10 +7076,12 @@ void sched_destroy_group(struct task_group *tg)
 	struct cfs_rq *cfs_rq = NULL;
 	int i;
 
+	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 	}
+	unlock_task_group_list();
 
 	BUG_ON(!cfs_rq);
 
@@ -7146,7 +7163,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (shares < 2)
 		shares = 2;
 
-	spin_lock(&tg->lock);
+	lock_task_group_list();
 	if (tg->shares == shares)
 		goto done;
 
@@ -7155,7 +7172,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		set_se_shares(tg->se[i], shares);
 
 done:
-	spin_unlock(&tg->lock);
+	unlock_task_group_list();
 	return 0;
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061..0c5fdce 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -690,7 +690,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline int
@@ -1132,7 +1132,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
 #endif
+	lock_task_group_list();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
+	unlock_task_group_list();
 }
 #endif
-- 
cgit v1.1


From 58e2d4ca581167c2a079f4ee02be2f0bc52e8729 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:00 +0100
Subject: sched: group scheduling, change how cpu load is calculated

This patch changes how the cpu load exerted by fair_sched_class tasks
is calculated. Load exerted by fair_sched_class tasks on a cpu is now
a summation of the group weights, rather than summation of task weights.
Weight exerted by a group on a cpu is dependent on the shares allocated
to it.

This version of patch has a minor impact on code size, but should have
no runtime/functional impact for !CONFIG_FAIR_GROUP_SCHED.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 27 +++++++++++----------------
 kernel/sched_fair.c | 31 +++++++++++++++++++++++++++----
 kernel/sched_rt.c   |  2 ++
 3 files changed, 40 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa6958..c915f3e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -886,6 +886,16 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_sub(&rq->load, load);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -896,26 +906,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 
 #define sched_class_highest (&rt_sched_class)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
 static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
 static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -4087,10 +4085,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4100,7 +4096,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c5fdce..30ae9c2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -760,15 +760,26 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se,
+			    *topse = NULL;	/* Highest schedulable entity */
+	int incload = 1;
 
 	for_each_sched_entity(se) {
-		if (se->on_rq)
+		topse = se;
+		if (se->on_rq) {
+			incload = 0;
 			break;
+		}
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, wakeup);
 		wakeup = 1;
 	}
+	/* Increment cpu load if we just enqueued the first task of a group on
+	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+	 * at the highest grouping level.
+	 */
+	if (incload)
+		inc_cpu_load(rq, topse->load.weight);
 }
 
 /*
@@ -779,16 +790,28 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se,
+			    *topse = NULL; 	/* Highest schedulable entity */
+	int decload = 1;
 
 	for_each_sched_entity(se) {
+		topse = se;
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, sleep);
 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		if (cfs_rq->load.weight) {
+			if (parent_entity(se))
+				decload = 0;
 			break;
+		}
 		sleep = 1;
 	}
+	/* Decrement cpu load if we just dequeued the last task of a group on
+	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+	 * at the highest grouping level.
+	 */
+	if (decload)
+		dec_cpu_load(rq, topse->load.weight);
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9ba3daa..cefcd51 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -32,6 +32,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 /*
@@ -46,6 +47,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
-- 
cgit v1.1


From a183561567b5446d3362b4839bd4f744f4b2af1e Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:00 +0100
Subject: sched: introduce a mutex and corresponding API to serialize access to
 doms_curarray

doms_cur[] array represents various scheduling domains which are
mutually exclusive. Currently cpusets code can modify this array (by
calling partition_sched_domains()) as a result of user modifying
sched_load_balance flag for various cpusets.

This patch introduces a mutex and corresponding API (only when
CONFIG_FAIR_GROUP_SCHED is defined) which allows a reader to safely read
the doms_cur[] array w/o worrying abt concurrent modifications to the
array.

The fair group scheduler code (introduced in next patch of this series)
makes use of this mutex to walk thr' doms_cur[] array while rebalancing
shares of task groups across cpus.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c915f3e..d9585f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -185,6 +185,9 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
  */
 static DEFINE_MUTEX(task_group_mutex);
 
+/* doms_cur_mutex serializes access to doms_cur[] array */
+static DEFINE_MUTEX(doms_cur_mutex);
+
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
@@ -234,11 +237,23 @@ static inline void unlock_task_group_list(void)
 	mutex_unlock(&task_group_mutex);
 }
 
+static inline void lock_doms_cur(void)
+{
+	mutex_lock(&doms_cur_mutex);
+}
+
+static inline void unlock_doms_cur(void)
+{
+	mutex_unlock(&doms_cur_mutex);
+}
+
 #else
 
 static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
 static inline void lock_task_group_list(void) { }
 static inline void unlock_task_group_list(void) { }
+static inline void lock_doms_cur(void) { }
+static inline void unlock_doms_cur(void) { }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
@@ -6543,6 +6558,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
 {
 	int i, j;
 
+	lock_doms_cur();
+
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
@@ -6583,6 +6600,8 @@ match2:
 	ndoms_cur = ndoms_new;
 
 	register_sched_domain_sysctl();
+
+	unlock_doms_cur();
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-- 
cgit v1.1


From 6b2d7700266b9402e12824e11e0099ae6a4a6a79 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:00 +0100
Subject: sched: group scheduler, fix fairness of cpu bandwidth allocation for
 task groups

The current load balancing scheme isn't good enough for precise
group fairness.

For example: on a 8-cpu system, I created 3 groups as under:

	a = 8 tasks (cpu.shares = 1024)
	b = 4 tasks (cpu.shares = 1024)
	c = 3 tasks (cpu.shares = 1024)

a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.

This is what I get with the latest scheduler git tree:

Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1  | Col2    | Col3  |  Col4
------|---------|-------|-------------------------------------------------------
a     | 277.676 | 57.8% | 54.1%  54.1%  54.1%  54.2%  56.7%  62.2%  62.8% 64.5%
b     | 116.108 | 24.2% | 47.4%  48.1%  48.7%  49.3%
c     |  86.326 | 18.0% | 47.5%  47.9%  48.5%
--------------------------------------------------------------------------------

Explanation of o/p:

Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
	group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
        percentage. Col3 data is derived as:
		Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
		Col4 = 100 * cpu_time_recd_by_task / 60

[I can share the test case that produces a similar o/p if reqd]

The deviation from desired group fairness is as below:

	a = +24.47%
	b = -9.13%
	c = -15.33%

which is quite high.

After the patch below is applied, here are the results:

--------------------------------------------------------------------------------
Col1  | Col2    | Col3  |  Col4
------|---------|-------|-------------------------------------------------------
a     | 163.112 | 34.0% | 33.2%  33.4%  33.5%  33.5%  33.7%  34.4%  34.8% 35.3%
b     | 156.220 | 32.5% | 63.3%  64.5%  66.1%  66.5%
c     | 160.653 | 33.5% | 85.8%  90.6%  91.4%
--------------------------------------------------------------------------------

Deviation from desired group fairness is as below:

	a = +0.67%
	b = -0.83%
	c = +0.17%

which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.

Why do we see bad (group) fairness with current scheuler?
=========================================================

Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:

	CPU0	CPU1
---------------------------
	A (10)  B(5)
		C(5)
---------------------------

Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).

The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:

	A = 50%
	B = 25%
	C = 25%

which is not the desired result.

What's changing in the patch?
=============================

	- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
	  defined (see below)
	- API Change
		- Two tunables introduced in sysfs (under SCHED_DEBUG) to
		  control the frequency at which the load balance monitor
		  thread runs.

The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.

Let,
	W(K,i)  = Weight of group K on cpu i
	T(K,i)  = Task load present in group K's cfs_rq on cpu i
	T(K)    = Total task load of group K across various cpus
	S(K) 	= Shares allocated to group K
	NRCPUS	= Number of online cpus in the scheduler domain to
	 	  which group K is assigned.

Then,
	W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)

A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.

Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>

- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 270 +++++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_fair.c |  84 ++++++++++------
 kernel/sysctl.c     |  18 ++++
 3 files changed, 327 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d9585f1..86e55a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
+
+	/*
+	 * shares assigned to a task group governs how much of cpu bandwidth
+	 * is allocated to the group. The more shares a group has, the more is
+	 * the cpu bandwidth allocated to it.
+	 *
+	 * For ex, lets say that there are three task groups, A, B and C which
+	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
+	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
+	 * should be:
+	 *
+	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+	 * 	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+	 *
+	 * The weight assigned to a task group's schedulable entities on every
+	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+	 * group's shares. For ex: lets say that task group A has been
+	 * assigned shares of 1000 and there are two CPUs in a system. Then,
+	 *
+	 *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+	 *
+	 * Note: It's not necessary that each of a task's group schedulable
+	 * 	 entity have the same weight on all CPUs. If the group
+	 * 	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+	 * 	 better distribution of weight could be:
+	 *
+	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+	 *	tg_A->se[1]->load.weight = 1/2 * 2000 =  667
+	 *
+	 * rebalance_shares() is responsible for distributing the shares of a
+	 * task groups like this among the group's schedulable entities across
+	 * cpus.
+	 *
+	 */
 	unsigned long shares;
+
 	struct rcu_head rcu;
 };
 
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
 /* doms_cur_mutex serializes access to doms_cur[] array */
 static DEFINE_MUTEX(doms_cur_mutex);
 
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_GROUP_SHARES       2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
+	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+					 "group_balance");
+	if (!IS_ERR(lb_monitor_task)) {
+		lb_monitor_task->flags |= PF_NOFREEZE;
+		wake_up_process(lb_monitor_task);
+	} else {
+		printk(KERN_ERR "Could not create load balance monitor thread"
+			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
+	}
+#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distrbution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(this_cpu);
+	cpumask_t sdspan = sd->span;
+	int balanced = 1;
+
+	/* Walk thr' all the task groups that we have */
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		int i;
+		unsigned long total_load = 0, total_shares;
+		struct task_group *tg = cfs_rq->tg;
+
+		/* Gather total task load of this group across cpus */
+		for_each_cpu_mask(i, sdspan)
+			total_load += tg->cfs_rq[i]->load.weight;
+
+		/* Nothing to do if this group has no load  */
+		if (!total_load)
+			continue;
+
+		/*
+		 * tg->shares represents the number of cpu shares the task group
+		 * is eligible to hold on a single cpu. On N cpus, it is
+		 * eligible to hold (N * tg->shares) number of cpu shares.
+		 */
+		total_shares = tg->shares * cpus_weight(sdspan);
+
+		/*
+		 * redistribute total_shares across cpus as per the task load
+		 * distribution.
+		 */
+		for_each_cpu_mask(i, sdspan) {
+			unsigned long local_load, local_shares;
+
+			local_load = tg->cfs_rq[i]->load.weight;
+			local_shares = (local_load * total_shares) / total_load;
+			if (!local_shares)
+				local_shares = MIN_GROUP_SHARES;
+			if (local_shares == tg->se[i]->load.weight)
+				continue;
+
+			spin_lock_irq(&cpu_rq(i)->lock);
+			set_se_shares(tg->se[i], local_shares);
+			spin_unlock_irq(&cpu_rq(i)->lock);
+			balanced = 0;
+		}
+	}
+
+	return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However higher frequency
+ * also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allows for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+	unsigned int timeout = sysctl_sched_min_bal_int_shares;
+	struct sched_param schedparm;
+	int ret;
+
+	/*
+	 * We don't want this thread's execution to be limited by the shares
+	 * assigned to default group (init_task_group). Hence make it run
+	 * as a SCHED_RR RT task at the lowest priority.
+	 */
+	schedparm.sched_priority = 1;
+	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+	if (ret)
+		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+				" monitor thread (error = %d) \n", ret);
+
+	while (!kthread_should_stop()) {
+		int i, cpu, balanced = 1;
+
+		/* Prevent cpus going down or coming up */
+		lock_cpu_hotplug();
+		/* lockout changes to doms_cur[] array */
+		lock_doms_cur();
+		/*
+		 * Enter a rcu read-side critical section to safely walk rq->sd
+		 * chain on various cpus and to walk task group list
+		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+		 */
+		rcu_read_lock();
+
+		for (i = 0; i < ndoms_cur; i++) {
+			cpumask_t cpumap = doms_cur[i];
+			struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+			cpu = first_cpu(cpumap);
+
+			/* Find the highest domain at which to balance shares */
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_LOAD_BALANCE))
+					continue;
+				sd_prev = sd;
+			}
+
+			sd = sd_prev;
+			/* sd == NULL? No load balance reqd in this domain */
+			if (!sd)
+				continue;
+
+			balanced &= rebalance_shares(sd, cpu);
+		}
+
+		rcu_read_unlock();
+
+		unlock_doms_cur();
+		unlock_cpu_hotplug();
+
+		if (!balanced)
+			timeout = sysctl_sched_min_bal_int_shares;
+		else if (timeout < sysctl_sched_max_bal_int_shares)
+			timeout *= 2;
+
+		msleep_interruptible(timeout);
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_SMP */
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
@@ -7144,47 +7356,77 @@ done:
 	task_rq_unlock(rq, &flags);
 }
 
+/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
+	if (!shares)
+		shares = MIN_GROUP_SHARES;
 
 	on_rq = se->on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_entity(cfs_rq, se, 0);
+		dec_cpu_load(rq, se->load.weight);
+	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq)
+	if (on_rq) {
 		enqueue_entity(cfs_rq, se, 0);
-
-	spin_unlock_irq(&rq->lock);
+		inc_cpu_load(rq, se->load.weight);
+	}
 }
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
-
-	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 *  limitation from this.)
-	 */
-	if (shares < 2)
-		shares = 2;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 
 	lock_task_group_list();
 	if (tg->shares == shares)
 		goto done;
 
+	if (shares < MIN_GROUP_SHARES)
+		shares = MIN_GROUP_SHARES;
+
+	/*
+	 * Prevent any load balance activity (rebalance_shares,
+	 * load_balance_fair) from referring to this group first,
+	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+	 */
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+	}
+
+	/* wait for any ongoing reference to this group to finish */
+	synchronize_sched();
+
+	/*
+	 * Now we are free to modify the group's share on each cpu
+	 * w/o tripping rebalance_share or load_balance_fair.
+	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		spin_lock_irq(&cpu_rq(i)->lock);
 		set_se_shares(tg->se[i], shares);
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
 
+	/*
+	 * Enable load balance activity on this group, by inserting it back on
+	 * each cpu's rq->leaf_cfs_rq_list.
+	 */
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		cfs_rq = tg->cfs_rq[i];
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+	}
 done:
 	unlock_task_group_list();
 	return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 30ae9c2..5c208e0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+#define GROUP_IMBALANCE_PCT	20
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running)
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
-
-	p = task_of(curr);
-
-	return p->prio;
-}
-#endif
-
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
+	unsigned long load_moved;
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
-		long imbalance;
-		unsigned long maxload;
+		struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+		unsigned long maxload, task_load, group_weight;
+		unsigned long thisload, per_task_load;
+		struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		task_load = busy_cfs_rq->load.weight;
+		group_weight = se->load.weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		/*
+		 * 'group_weight' is contributed by tasks of total weight
+		 * 'task_load'. To move 'rem_load_move' worth of weight only,
+		 * we need to move a maximum task load of:
+		 *
+		 * 	maxload = (remload / group_weight) * task_load;
+		 */
+		maxload = (rem_load_move * task_load) / group_weight;
+
+		if (!maxload || !task_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		per_task_load = task_load / busy_cfs_rq->nr_running;
+		/*
+		 * balance_tasks will try to forcibly move atleast one task if
+		 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+		 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
+		 */
+		 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+			continue;
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		/* Disable priority-based load balance */
+		*this_best_prio = 0;
+		thisload = this_cfs_rq->load.weight;
 #else
 # define maxload rem_load_move
 #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+		load_moved = balance_tasks(this_rq, this_cpu, busiest,
 					       maxload, sd, idle, all_pinned,
 					       this_best_prio,
 					       &cfs_rq_iterator);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		/*
+		 * load_moved holds the task load that was moved. The
+		 * effective (group) weight moved would be:
+		 * 	load_moved_eff = load_moved/task_load * group_weight;
+		 */
+		load_moved = (group_weight * load_moved) / task_load;
+
+		/* Adjust shares on both cpus to reflect load_moved */
+		group_weight -= load_moved;
+		set_se_shares(se, group_weight);
+
+		se = busy_cfs_rq->tg->se[this_cpu];
+		if (!thisload)
+			group_weight = load_moved;
+		else
+			group_weight = se->load.weight + load_moved;
+		set_se_shares(se, group_weight);
+#endif
+
+		rem_load_move -= load_moved;
+
 		if (rem_load_move <= 0)
 			break;
 	}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c68f68d..c95f3ed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = {
 		.mode		= 644,
 		.proc_handler	= &proc_dointvec,
 	},
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "sched_min_bal_int_shares",
+		.data           = &sysctl_sched_min_bal_int_shares,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "sched_max_bal_int_shares",
+		.data           = &sysctl_sched_max_bal_int_shares,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- 
cgit v1.1


From d221938c049f4845da13c8593132595a6b9222a8 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Fri, 25 Jan 2008 21:08:01 +0100
Subject: cpu-hotplug: refcount based cpu hotplug

This patch implements a Refcount + Waitqueue based model for
cpu-hotplug.

Now, a thread which wants to prevent cpu-hotplug, will bump up a global
refcount and the thread which wants to perform a cpu-hotplug operation
will block till the global refcount goes to zero.

The readers, if any, during an ongoing cpu-hotplug operation are blocked
until the cpu-hotplug operation is over.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Paul Jackson <pj@sgi.com> [For !CONFIG_HOTPLUG_CPU ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 152 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 111 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c1..656dc3f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
-/* This protects CPUs going up and down... */
+/* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
-static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
 
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
  */
 static int cpu_hotplug_disabled;
 
-#ifdef CONFIG_HOTPLUG_CPU
+static struct {
+	struct task_struct *active_writer;
+	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/*
+	 * Also blocks the new readers during
+	 * an ongoing cpu hotplug operation.
+	 */
+	int refcount;
+	wait_queue_head_t writer_queue;
+} cpu_hotplug;
 
-/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
-static struct task_struct *recursive;
-static int recursive_depth;
+#define writer_exists() (cpu_hotplug.active_writer != NULL)
+
+void __init cpu_hotplug_init(void)
+{
+	cpu_hotplug.active_writer = NULL;
+	mutex_init(&cpu_hotplug.lock);
+	cpu_hotplug.refcount = 0;
+	init_waitqueue_head(&cpu_hotplug.writer_queue);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
 
 void lock_cpu_hotplug(void)
 {
-	struct task_struct *tsk = current;
-
-	if (tsk == recursive) {
-		static int warnings = 10;
-		if (warnings) {
-			printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
-			WARN_ON(1);
-			warnings--;
-		}
-		recursive_depth++;
+	might_sleep();
+	if (cpu_hotplug.active_writer == current)
 		return;
-	}
-	mutex_lock(&cpu_bitmask_lock);
-	recursive = tsk;
+	mutex_lock(&cpu_hotplug.lock);
+	cpu_hotplug.refcount++;
+	mutex_unlock(&cpu_hotplug.lock);
+
 }
 EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
 
 void unlock_cpu_hotplug(void)
 {
-	WARN_ON(recursive != current);
-	if (recursive_depth) {
-		recursive_depth--;
+	if (cpu_hotplug.active_writer == current)
 		return;
-	}
-	recursive = NULL;
-	mutex_unlock(&cpu_bitmask_lock);
+	mutex_lock(&cpu_hotplug.lock);
+	cpu_hotplug.refcount--;
+
+	if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
+		wake_up(&cpu_hotplug.writer_queue);
+
+	mutex_unlock(&cpu_hotplug.lock);
+
 }
 EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 
 #endif	/* CONFIG_HOTPLUG_CPU */
 
+/*
+ * The following two API's must be used when attempting
+ * to serialize the updates to cpu_online_map, cpu_present_map.
+ */
+void cpu_maps_update_begin(void)
+{
+	mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+	mutex_unlock(&cpu_add_remove_lock);
+}
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_maps_update_begin is always called after invoking
+ * cpu_maps_update_begin, we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ *   writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock finds the refcount
+ *   non zero and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * lock_cpu_hotplug() not an api which is called all that often.
+ *
+ */
+static void cpu_hotplug_begin(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	mutex_lock(&cpu_hotplug.lock);
+
+	cpu_hotplug.active_writer = current;
+	add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
+	while (cpu_hotplug.refcount) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&cpu_hotplug.lock);
+		schedule();
+		mutex_lock(&cpu_hotplug.lock);
+	}
+	remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
+}
+
+static void cpu_hotplug_done(void)
+{
+	cpu_hotplug.active_writer = NULL;
+	mutex_unlock(&cpu_hotplug.lock);
+}
 /* Need to know about CPUs going up/down? */
 int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
 	int ret;
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	ret = raw_notifier_chain_register(&cpu_chain, nb);
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return ret;
 }
 
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	raw_notifier_chain_unregister(&cpu_chain, nb);
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -147,6 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
+	cpu_hotplug_begin();
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
@@ -166,9 +237,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed(current, tmp);
 
-	mutex_lock(&cpu_bitmask_lock);
 	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
-	mutex_unlock(&cpu_bitmask_lock);
 
 	if (IS_ERR(p) || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
@@ -203,6 +272,7 @@ out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out_release:
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+	cpu_hotplug_done();
 	return err;
 }
 
@@ -210,13 +280,13 @@ int cpu_down(unsigned int cpu)
 {
 	int err = 0;
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 	else
 		err = _cpu_down(cpu, 0);
 
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,6 +301,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 
+	cpu_hotplug_begin();
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
 	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
 							-1, &nr_calls);
@@ -243,9 +314,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	}
 
 	/* Arch-specific enabling code. */
-	mutex_lock(&cpu_bitmask_lock);
 	ret = __cpu_up(cpu);
-	mutex_unlock(&cpu_bitmask_lock);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
@@ -258,6 +327,7 @@ out_notify:
 		__raw_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
 	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+	cpu_hotplug_done();
 
 	return ret;
 }
@@ -275,13 +345,13 @@ int __cpuinit cpu_up(unsigned int cpu)
 		return -EINVAL;
 	}
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 	else
 		err = _cpu_up(cpu, 0);
 
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return err;
 }
 
@@ -292,7 +362,7 @@ int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error = 0;
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	first_cpu = first_cpu(cpu_online_map);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +389,7 @@ int disable_nonboot_cpus(void)
 	} else {
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return error;
 }
 
@@ -328,7 +398,7 @@ void enable_nonboot_cpus(void)
 	int cpu, error;
 
 	/* Allow everyone to use the CPU hotplug again */
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	cpu_hotplug_disabled = 0;
 	if (cpus_empty(frozen_cpus))
 		goto out;
@@ -344,6 +414,6 @@ void enable_nonboot_cpus(void)
 	}
 	cpus_clear(frozen_cpus);
 out:
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
-- 
cgit v1.1


From 86ef5c9a8edd78e6bf92879f32329d89b2d55b5a Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Fri, 25 Jan 2008 21:08:02 +0100
Subject: cpu-hotplug: replace lock_cpu_hotplug() with get_online_cpus()

Replace all lock_cpu_hotplug/unlock_cpu_hotplug from the kernel and use
get_online_cpus and put_online_cpus instead as it highlights the
refcount semantics in these operations.

The new API guarantees protection against the cpu-hotplug operation, but
it doesn't guarantee serialized access to any of the local data
structures. Hence the changes needs to be reviewed.

In case of pseries_add_processor/pseries_remove_processor, use
cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the
cpu_present_map there.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c          | 10 +++++-----
 kernel/cpuset.c       | 14 +++++++-------
 kernel/rcutorture.c   |  6 +++---
 kernel/sched.c        |  4 ++--
 kernel/stop_machine.c |  4 ++--
 5 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 656dc3f..b0c4152 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -48,7 +48,7 @@ void __init cpu_hotplug_init(void)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-void lock_cpu_hotplug(void)
+void get_online_cpus(void)
 {
 	might_sleep();
 	if (cpu_hotplug.active_writer == current)
@@ -58,9 +58,9 @@ void lock_cpu_hotplug(void)
 	mutex_unlock(&cpu_hotplug.lock);
 
 }
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(get_online_cpus);
 
-void unlock_cpu_hotplug(void)
+void put_online_cpus(void)
 {
 	if (cpu_hotplug.active_writer == current)
 		return;
@@ -73,7 +73,7 @@ void unlock_cpu_hotplug(void)
 	mutex_unlock(&cpu_hotplug.lock);
 
 }
-EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(put_online_cpus);
 
 #endif	/* CONFIG_HOTPLUG_CPU */
 
@@ -110,7 +110,7 @@ void cpu_maps_update_done(void)
  *   non zero and goes to sleep again.
  *
  * However, this is very difficult to achieve in practice since
- * lock_cpu_hotplug() not an api which is called all that often.
+ * get_online_cpus() not an api which is called all that often.
  *
  */
 static void cpu_hotplug_begin(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc4..cfaf641 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  *
  * Call with cgroup_mutex held.  May take callback_mutex during
  * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * a call to the get_online_cpus()/put_online_cpus() pair.
  * Must not be called holding callback_mutex, because we must not
- * call lock_cpu_hotplug() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
+ * call get_online_cpus() while holding callback_mutex.  Elsewhere
+ * the kernel nests callback_mutex inside get_online_cpus() calls.
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
 
 rebuild:
 	/* Have scheduler rebuild sched domains */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	partition_sched_domains(ndoms, doms);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 done:
 	if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
  *
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The lock_cpu_hotplug()
+ * will call rebuild_sched_domains().  The get_online_cpus()
  * call in rebuild_sched_domains() must not be made while holding
  * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * lock_cpu_hotplug() calls.  So the reverse nesting would risk an
+ * get_online_cpus() calls.  So the reverse nesting would risk an
  * ABBA deadlock.
  */
 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c..fd59982 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
 	cpumask_t tmp_mask = CPU_MASK_ALL;
 	int i;
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
 	if (num_online_cpus() == 1) {
-		unlock_cpu_hotplug();
+		put_online_cpus();
 		return;
 	}
 
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
 	else
 		rcu_idle_cpu--;
 
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/sched.c b/kernel/sched.c
index 86e55a9..672aa68 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7152,7 +7152,7 @@ static int load_balance_monitor(void *unused)
 		int i, cpu, balanced = 1;
 
 		/* Prevent cpus going down or coming up */
-		lock_cpu_hotplug();
+		get_online_cpus();
 		/* lockout changes to doms_cur[] array */
 		lock_doms_cur();
 		/*
@@ -7186,7 +7186,7 @@ static int load_balance_monitor(void *unused)
 		rcu_read_unlock();
 
 		unlock_doms_cur();
-		unlock_cpu_hotplug();
+		put_online_cpus();
 
 		if (!balanced)
 			timeout = sysctl_sched_min_bal_int_shares;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821e..51b5ee5 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 	int ret;
 
 	/* No CPUs can come up or down during this. */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	p = __stop_machine_run(fn, data, cpu);
 	if (!IS_ERR(p))
 		ret = kthread_stop(p);
 	else
 		ret = PTR_ERR(p);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 	return ret;
 }
-- 
cgit v1.1


From 95402b3829010fe1e208f44e4a158ccade88969a Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Fri, 25 Jan 2008 21:08:02 +0100
Subject: cpu-hotplug: replace per-subsystem mutexes with get_online_cpus()

This patch converts the known per-subsystem mutexes to get_online_cpus
put_online_cpus. It also eliminates the CPU_LOCK_ACQUIRE and
CPU_LOCK_RELEASE hotplug notification events.

Signed-off-by: Gautham  R Shenoy <ego@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c       |  4 ----
 kernel/sched.c     | 25 +++++++++----------------
 kernel/workqueue.c | 35 +++++++++++++++--------------------
 3 files changed, 24 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index b0c4152..e0d3a4f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -218,7 +218,6 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
 	if (err == NOTIFY_BAD) {
@@ -271,7 +270,6 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out_release:
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
 	cpu_hotplug_done();
 	return err;
 }
@@ -302,7 +300,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
 	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
 							-1, &nr_calls);
 	if (ret == NOTIFY_BAD) {
@@ -326,7 +323,6 @@ out_notify:
 	if (ret != 0)
 		__raw_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
 	cpu_hotplug_done();
 
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index 672aa68..c0e2db6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -439,7 +439,6 @@ struct rq {
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 {
@@ -4546,13 +4545,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 	struct task_struct *p;
 	int retval;
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 
 	p = find_process_by_pid(pid);
 	if (!p) {
 		read_unlock(&tasklist_lock);
-		mutex_unlock(&sched_hotcpu_mutex);
+		put_online_cpus();
 		return -ESRCH;
 	}
 
@@ -4592,7 +4591,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 	}
 out_unlock:
 	put_task_struct(p);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -4649,7 +4648,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 	struct task_struct *p;
 	int retval;
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 
 	retval = -ESRCH;
@@ -4665,7 +4664,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 
 out_unlock:
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 
 	return retval;
 }
@@ -5625,9 +5624,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	struct rq *rq;
 
 	switch (action) {
-	case CPU_LOCK_ACQUIRE:
-		mutex_lock(&sched_hotcpu_mutex);
-		break;
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
@@ -5697,9 +5693,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;
 #endif
-	case CPU_LOCK_RELEASE:
-		mutex_unlock(&sched_hotcpu_mutex);
-		break;
 	}
 	return NOTIFY_OK;
 }
@@ -6655,10 +6648,10 @@ static int arch_reinit_sched_domains(void)
 {
 	int err;
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	detach_destroy_domains(&cpu_online_map);
 	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 
 	return err;
 }
@@ -6769,12 +6762,12 @@ void __init sched_init_smp(void)
 {
 	cpumask_t non_isolated_cpus;
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	arch_init_sched_domains(&cpu_online_map);
 	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
 	if (cpus_empty(non_isolated_cpus))
 		cpu_set(smp_processor_id(), non_isolated_cpus);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8db0b59..52db48e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
 #endif
 };
 
-/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
-   threads to each one as cpus come/go. */
-static DEFINE_MUTEX(workqueue_mutex);
+/* Serializes the accesses to the list of workqueues. */
+static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 
 static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  * Returns zero on success.
  * Returns -ve errno on failure.
  *
- * Appears to be racy against CPU hotplug.
- *
  * schedule_on_each_cpu() is very slow.
  */
 int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
-	preempt_disable();		/* CPU hotplug */
+	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
 	}
-	preempt_enable();
 	flush_workqueue(keventd_wq);
+	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		start_workqueue_thread(cwq, -1);
 	} else {
-		mutex_lock(&workqueue_mutex);
+		get_online_cpus();
+		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
+		spin_unlock(&workqueue_lock);
 
 		for_each_possible_cpu(cpu) {
 			cwq = init_cpu_workqueue(wq, cpu);
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 			err = create_workqueue_thread(cwq, cpu);
 			start_workqueue_thread(cwq, cpu);
 		}
-		mutex_unlock(&workqueue_mutex);
+		put_online_cpus();
 	}
 
 	if (err) {
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	/*
 	 * Our caller is either destroy_workqueue() or CPU_DEAD,
-	 * workqueue_mutex protects cwq->thread
+	 * get_online_cpus() protects cwq->thread.
 	 */
 	if (cwq->thread == NULL)
 		return;
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	struct cpu_workqueue_struct *cwq;
 	int cpu;
 
-	mutex_lock(&workqueue_mutex);
+	get_online_cpus();
+	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
-	mutex_unlock(&workqueue_mutex);
+	spin_unlock(&workqueue_lock);
+	put_online_cpus();
 
 	for_each_cpu_mask(cpu, *cpu_map) {
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	action &= ~CPU_TASKS_FROZEN;
 
 	switch (action) {
-	case CPU_LOCK_ACQUIRE:
-		mutex_lock(&workqueue_mutex);
-		return NOTIFY_OK;
-
-	case CPU_LOCK_RELEASE:
-		mutex_unlock(&workqueue_mutex);
-		return NOTIFY_OK;
 
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		case CPU_UP_PREPARE:
 			if (!create_workqueue_thread(cwq, cpu))
 				break;
-			printk(KERN_ERR "workqueue for %i failed\n", cpu);
+			printk(KERN_ERR "workqueue [%s] for %i failed\n",
+				wq->name, cpu);
 			return NOTIFY_BAD;
 
 		case CPU_ONLINE:
-- 
cgit v1.1


From 82a1fcb90287052aabfa235e7ffc693ea003fe69 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:02 +0100
Subject: softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks

this patch extends the soft-lockup detector to automatically
detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
printed the following way:

 ------------------>
 INFO: task prctl:3042 blocked for more than 120 seconds.
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
 prctl         D fd5e3793     0  3042   2997
        f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
        f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
        f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
 Call Trace:
  [<c04883a5>] schedule_timeout+0x6d/0x8b
  [<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
  [<c0133a76>] msleep+0x10/0x16
  [<c0138974>] sys_prctl+0x30/0x1e2
  [<c0104c52>] sysenter_past_esp+0x5f/0xa5
  =======================
 2 locks held by prctl/3042:
 #0:  (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
 #1:  (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
 <------------------

the current default timeout is 120 seconds. Such messages are printed
up to 10 times per bootup. If the system has crashed already then the
messages are not printed.

if lockdep is enabled then all held locks are printed as well.

this feature is a natural extension to the softlockup-detector (kernel
locked up without scheduling) and to the NMI watchdog (kernel locked up
with IRQs disabled).

[ Gautham R Shenoy <ego@in.ibm.com>: CPU hotplug fixes. ]
[ Andrew Morton <akpm@linux-foundation.org>: build warning fix. ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/fork.c       |   5 +++
 kernel/lockdep.c    |  12 +++++-
 kernel/sched.c      |   4 +-
 kernel/softlockup.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c     |  27 +++++++++++++
 5 files changed, 149 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff2..09c0b90 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+	p->last_switch_count = 0;
+	p->last_switch_timestamp = 0;
+#endif
+
 #ifdef CONFIG_TASK_XACCT
 	p->rchar = 0;		/* I/O counter: bytes read */
 	p->wchar = 0;		/* I/O counter: bytes written */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e2c07ece..3574379 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3206,7 +3206,11 @@ retry:
 
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 
-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
 {
 	if (unlikely(!debug_locks)) {
 		printk("INFO: lockdep is turned off.\n");
@@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
 	}
 	lockdep_print_held_locks(task);
 }
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+		__debug_show_held_locks(task);
+}
 
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index c0e2db6..5b3d465 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4945,7 +4945,7 @@ out_unlock:
 
 static const char stat_nam[] = "RSDTtZX";
 
-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	unsigned state;
@@ -4998,7 +4998,7 @@ void show_state_filter(unsigned long state_filter)
 		 */
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
-			show_task(p);
+			sched_show_task(p);
 	} while_each_thread(g, p);
 
 	touch_all_softlockup_watchdogs();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812..02f0ad5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
  */
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
@@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int did_panic;
-int softlockup_thresh = 10;
+int softlockup_thresh = 60;
 
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
  */
 static unsigned long get_timestamp(int this_cpu)
 {
-	return cpu_clock(this_cpu) >> 30;  /* 2^30 ~= 10^9 */
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
 void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
 
 	now = get_timestamp(this_cpu);
 
-	/* Wake up the high-prio watchdog task every second: */
-	if (now > (touch_timestamp + 1))
-		wake_up_process(per_cpu(watchdog_task, this_cpu));
-
-	/* Warn about unreasonable 10+ seconds delays: */
+	/* Warn about unreasonable delays: */
 	if (now <= (touch_timestamp + softlockup_thresh))
 		return;
 
@@ -122,11 +119,93 @@ void softlockup_tick(void)
 }
 
 /*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long sysctl_hung_task_timeout_secs = 120;
+
+long sysctl_hung_task_warnings = 10;
+
+/*
+ * Only do the hung-tasks check on one CPU:
+ */
+static int check_cpu __read_mostly = -1;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	if (t->flags & PF_FROZEN)
+		return;
+
+	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+		t->last_switch_count = switch_count;
+		t->last_switch_timestamp = now;
+		return;
+	}
+	if ((long)(now - t->last_switch_timestamp) <
+					sysctl_hung_task_timeout_secs)
+		return;
+	if (sysctl_hung_task_warnings < 0)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than 2 minutes,
+	 * complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%ld seconds.\n", t->comm, t->pid,
+			sysctl_hung_task_timeout_secs);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	t->last_switch_timestamp = now;
+	touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+	int max_count = sysctl_hung_task_check_count;
+	unsigned long now = get_timestamp(this_cpu);
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if ((tainted & TAINT_DIE) || did_panic)
+		return;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, t) {
+		if (!--max_count)
+			break;
+		if (t->state & TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, now);
+	} while_each_thread(g, t);
+
+	read_unlock(&tasklist_lock);
+}
+
+/*
  * The watchdog thread - runs every second and touches the timestamp.
  */
 static int watchdog(void *__bind_cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	int this_cpu = (long)__bind_cpu;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
 
 	/*
 	 * Run briefly once per second to reset the softlockup timestamp.
-	 * If this gets delayed for more than 10 seconds then the
+	 * If this gets delayed for more than 60 seconds then the
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
-		schedule();
+		msleep_interruptible(10000);
+
+		if (this_cpu != check_cpu)
+			continue;
+
+		if (sysctl_hung_task_timeout_secs)
+			check_hung_uninterruptible_tasks(this_cpu);
 	}
 
 	return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
+		check_cpu = any_online_cpu(cpu_online_map);
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(watchdog_task, hotcpu),
 			     any_online_cpu(cpu_online_map));
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		if (hotcpu == check_cpu) {
+			cpumask_t temp_cpu_online_map = cpu_online_map;
+
+			cpu_clear(hotcpu, temp_cpu_online_map);
+			check_cpu = any_online_cpu(temp_cpu_online_map);
+		}
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		p = per_cpu(watchdog_task, hotcpu);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c95f3ed..96f31c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 		.extra2		= &sixty,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_check_count",
+		.data		= &sysctl_hung_task_check_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_timeout_secs",
+		.data		= &sysctl_hung_task_timeout_secs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_warnings",
+		.data		= &sysctl_hung_task_warnings,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
 #endif
 #ifdef CONFIG_COMPAT
 	{
-- 
cgit v1.1


From 63489e45e265f64c368882be1f01c42dec5d984c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:03 +0100
Subject: sched: count # of queued RT tasks

This patch adds accounting to keep track of the number of RT tasks running
on a runqueue. This information will be used in later patches.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  1 +
 kernel/sched_rt.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b3d465..9dd8d12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -342,6 +342,7 @@ struct rt_rq {
 	struct rt_prio_array active;
 	int rt_load_balance_idx;
 	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
+	unsigned long rt_nr_running;
 };
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cefcd51..e3d2ca8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -26,6 +26,19 @@ static void update_curr_rt(struct rq *rq)
 	cpuacct_charge(curr, delta_exec);
 }
 
+static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
+{
+	WARN_ON(!rt_task(p));
+	rq->rt.rt_nr_running++;
+}
+
+static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
+{
+	WARN_ON(!rt_task(p));
+	WARN_ON(!rq->rt.rt_nr_running);
+	rq->rt.rt_nr_running--;
+}
+
 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	struct rt_prio_array *array = &rq->rt.active;
@@ -33,6 +46,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	inc_cpu_load(rq, p->se.load.weight);
+
+	inc_rt_tasks(p, rq);
 }
 
 /*
@@ -48,6 +63,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
 	dec_cpu_load(rq, p->se.load.weight);
+
+	dec_rt_tasks(p, rq);
 }
 
 /*
-- 
cgit v1.1


From 764a9d6fe4b52995c8aba277e3634385699354f4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:04 +0100
Subject: sched: track highest prio task queued

This patch adds accounting to each runqueue to keep track of the
highest prio task queued on the run queue. We only care about
RT tasks, so if the run queue does not contain any active RT tasks
its priority will be considered MAX_RT_PRIO.

This information will be used for later patches.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  3 +++
 kernel/sched_rt.c | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9dd8d12..6185fa0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -343,6 +343,8 @@ struct rt_rq {
 	int rt_load_balance_idx;
 	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 	unsigned long rt_nr_running;
+	/* highest queued rt task prio */
+	int highest_prio;
 };
 
 /*
@@ -6864,6 +6866,7 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
+		rq->rt.highest_prio = MAX_RT_PRIO;
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e3d2ca8..136c285 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -30,6 +30,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 {
 	WARN_ON(!rt_task(p));
 	rq->rt.rt_nr_running++;
+#ifdef CONFIG_SMP
+	if (p->prio < rq->rt.highest_prio)
+		rq->rt.highest_prio = p->prio;
+#endif /* CONFIG_SMP */
 }
 
 static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -37,6 +41,20 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 	WARN_ON(!rt_task(p));
 	WARN_ON(!rq->rt.rt_nr_running);
 	rq->rt.rt_nr_running--;
+#ifdef CONFIG_SMP
+	if (rq->rt.rt_nr_running) {
+		struct rt_prio_array *array;
+
+		WARN_ON(p->prio < rq->rt.highest_prio);
+		if (p->prio == rq->rt.highest_prio) {
+			/* recalculate */
+			array = &rq->rt.active;
+			rq->rt.highest_prio =
+				sched_find_first_bit(array->bitmap);
+		} /* otherwise leave rq->highest prio alone */
+	} else
+		rq->rt.highest_prio = MAX_RT_PRIO;
+#endif /* CONFIG_SMP */
 }
 
 static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
-- 
cgit v1.1


From e8fa136262e1121288bb93befe2295928ffd240d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:05 +0100
Subject: sched: add RT task pushing

This patch adds an algorithm to push extra RT tasks off a run queue to
other CPU runqueues.

When more than one RT task is added to a run queue, this algorithm takes
an assertive approach to push the RT tasks that are not running onto other
run queues that have lower priority.  The way this works is that the highest
RT task that is not running is looked at and we examine the runqueues on
the CPUS for that tasks affinity mask. We find the runqueue with the lowest
prio in the CPU affinity of the picked task, and if it is lower in prio than
the picked task, we push the task onto that CPU runqueue.

We continue pushing RT tasks off the current runqueue until we don't push any
more.  The algorithm stops when the next highest RT task can't preempt any
other processes on other CPUS.

TODO: The algorithm may stop when there are still RT tasks that can be
 migrated. Specifically, if the highest non running RT task CPU affinity
 is restricted to CPUs that are running higher priority tasks, there may
 be a lower priority task queued that has an affinity with a CPU that is
 running a lower priority task that it could be migrated to.  This
 patch set does not address this issue.

Note: checkpatch reveals two over 80 character instances. I'm not sure
 that breaking them up will help visually, so I left them as is.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |   8 +-
 kernel/sched_rt.c | 225 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 231 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6185fa0..97cab60 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
+	schedule_tail_balance_rt(rq);
+
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
@@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
+	int ret = 0;
+
 	if (unlikely(!irqs_disabled())) {
 		/* printk() doesn't work good under rq->lock */
 		spin_unlock(&this_rq->lock);
@@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
+			ret = 1;
 		} else
 			spin_lock(&busiest->lock);
 	}
+	return ret;
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 136c285..7815e90 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+{
+	struct rt_prio_array *array = &rq->rt.active;
+	struct task_struct *next;
+	struct list_head *queue;
+	int idx;
+
+	assert_spin_locked(&rq->lock);
+
+	if (likely(rq->rt.rt_nr_running < 2))
+		return NULL;
+
+	idx = sched_find_first_bit(array->bitmap);
+	if (unlikely(idx >= MAX_RT_PRIO)) {
+		WARN_ON(1); /* rt_nr_running is bad */
+		return NULL;
+	}
+
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+	if (unlikely(next != rq->curr))
+		return next;
+
+	if (queue->next->next != queue) {
+		/* same prio task */
+		next = list_entry(queue->next->next, struct task_struct, run_list);
+		return next;
+	}
+
+	/* slower, but more flexible */
+	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+	if (unlikely(idx >= MAX_RT_PRIO)) {
+		WARN_ON(1); /* rt_nr_running was 2 and above! */
+		return NULL;
+	}
+
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+
+	return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+				      struct rq *this_rq)
+{
+	struct rq *lowest_rq = NULL;
+	int cpu;
+	int tries;
+	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+
+	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+
+	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+		/*
+		 * Scan each rq for the lowest prio.
+		 */
+		for_each_cpu_mask(cpu, *cpu_mask) {
+			struct rq *rq = &per_cpu(runqueues, cpu);
+
+			if (cpu == this_rq->cpu)
+				continue;
+
+			/* We look for lowest RT prio or non-rt CPU */
+			if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+				lowest_rq = rq;
+				break;
+			}
+
+			/* no locking for now */
+			if (rq->rt.highest_prio > task->prio &&
+			    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
+				lowest_rq = rq;
+			}
+		}
+
+		if (!lowest_rq)
+			break;
+
+		/* if the prio of this runqueue changed, try again */
+		if (double_lock_balance(this_rq, lowest_rq)) {
+			/*
+			 * We had to unlock the run queue. In
+			 * the mean time, task could have
+			 * migrated already or had its affinity changed.
+			 * Also make sure that it wasn't scheduled on its rq.
+			 */
+			if (unlikely(task_rq(task) != this_rq ||
+				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+				     task_running(this_rq, task) ||
+				     !task->se.on_rq)) {
+				spin_unlock(&lowest_rq->lock);
+				lowest_rq = NULL;
+				break;
+			}
+		}
+
+		/* If this rq is still suitable use it. */
+		if (lowest_rq->rt.highest_prio > task->prio)
+			break;
+
+		/* try again */
+		spin_unlock(&lowest_rq->lock);
+		lowest_rq = NULL;
+	}
+
+	return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+	struct task_struct *next_task;
+	struct rq *lowest_rq;
+	int ret = 0;
+	int paranoid = RT_MAX_TRIES;
+
+	assert_spin_locked(&this_rq->lock);
+
+	next_task = pick_next_highest_task_rt(this_rq);
+	if (!next_task)
+		return 0;
+
+ retry:
+	if (unlikely(next_task == this_rq->curr))
+		return 0;
+
+	/*
+	 * It's possible that the next_task slipped in of
+	 * higher priority than current. If that's the case
+	 * just reschedule current.
+	 */
+	if (unlikely(next_task->prio < this_rq->curr->prio)) {
+		resched_task(this_rq->curr);
+		return 0;
+	}
+
+	/* We might release this_rq lock */
+	get_task_struct(next_task);
+
+	/* find_lock_lowest_rq locks the rq if found */
+	lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+	if (!lowest_rq) {
+		struct task_struct *task;
+		/*
+		 * find lock_lowest_rq releases this_rq->lock
+		 * so it is possible that next_task has changed.
+		 * If it has, then try again.
+		 */
+		task = pick_next_highest_task_rt(this_rq);
+		if (unlikely(task != next_task) && task && paranoid--) {
+			put_task_struct(next_task);
+			next_task = task;
+			goto retry;
+		}
+		goto out;
+	}
+
+	assert_spin_locked(&lowest_rq->lock);
+
+	deactivate_task(this_rq, next_task, 0);
+	set_task_cpu(next_task, lowest_rq->cpu);
+	activate_task(lowest_rq, next_task, 0);
+
+	resched_task(lowest_rq->curr);
+
+	spin_unlock(&lowest_rq->lock);
+
+	ret = 1;
+out:
+	put_task_struct(next_task);
+
+	return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ *       the queue, and stop when it can't migrate (or there's
+ *       no more RT tasks).  There may be a case where a lower
+ *       priority RT task has a different affinity than the
+ *       higher RT task. In this case the lower RT task could
+ *       possibly be able to migrate where as the higher priority
+ *       RT task could not.  We currently ignore this issue.
+ *       Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+	/* push_rt_task will return true if it moved an RT */
+	while (push_rt_task(rq))
+		;
+}
+
+static void schedule_tail_balance_rt(struct rq *rq)
+{
+	/*
+	 * If we have more than one rt_task queued, then
+	 * see if we can push the other rt_tasks off to other CPUS.
+	 * Note we may release the rq lock, and since
+	 * the lock was owned by prev, we need to release it
+	 * first via finish_lock_switch and then reaquire it here.
+	 */
+	if (unlikely(rq->rt.rt_nr_running > 1)) {
+		spin_lock_irq(&rq->lock);
+		push_rt_tasks(rq);
+		spin_unlock_irq(&rq->lock);
+	}
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
 				  &rt_rq_iterator);
 }
-#endif
+#else /* CONFIG_SMP */
+# define schedule_tail_balance_rt(rq)	do { } while (0)
+#endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
-- 
cgit v1.1


From 4fd29176b7cd24909f8ceba2105cb3ae2857b90c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:06 +0100
Subject: sched: add rt-overload tracking

This patch adds an RT overload accounting system. When a runqueue has
more than one RT task queued, it is marked as overloaded. That is that it
is a candidate to have RT tasks pulled from it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 7815e90..547f858 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,38 @@
  * policies)
  */
 
+#ifdef CONFIG_SMP
+static cpumask_t rt_overload_mask;
+static atomic_t rto_count;
+static inline int rt_overloaded(void)
+{
+	return atomic_read(&rto_count);
+}
+static inline cpumask_t *rt_overload(void)
+{
+	return &rt_overload_mask;
+}
+static inline void rt_set_overload(struct rq *rq)
+{
+	cpu_set(rq->cpu, rt_overload_mask);
+	/*
+	 * Make sure the mask is visible before we set
+	 * the overload count. That is checked to determine
+	 * if we should look at the mask. It would be a shame
+	 * if we looked at the mask, but the mask was not
+	 * updated yet.
+	 */
+	wmb();
+	atomic_inc(&rto_count);
+}
+static inline void rt_clear_overload(struct rq *rq)
+{
+	/* the order here really doesn't matter */
+	atomic_dec(&rto_count);
+	cpu_clear(rq->cpu, rt_overload_mask);
+}
+#endif /* CONFIG_SMP */
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -33,6 +65,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 #ifdef CONFIG_SMP
 	if (p->prio < rq->rt.highest_prio)
 		rq->rt.highest_prio = p->prio;
+	if (rq->rt.rt_nr_running > 1)
+		rt_set_overload(rq);
 #endif /* CONFIG_SMP */
 }
 
@@ -54,6 +88,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 		} /* otherwise leave rq->highest prio alone */
 	} else
 		rq->rt.highest_prio = MAX_RT_PRIO;
+	if (rq->rt.rt_nr_running < 2)
+		rt_clear_overload(rq);
 #endif /* CONFIG_SMP */
 }
 
-- 
cgit v1.1


From f65eda4f789168ba5ff3fa75546c29efeed19f58 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:07 +0100
Subject: sched: pull RT tasks from overloaded runqueues

This patch adds the algorithm to pull tasks from RT overloaded runqueues.

When a pull RT is initiated, all overloaded runqueues are examined for
a RT task that is higher in prio than the highest prio task queued on the
target runqueue. If another runqueue holds a RT task that is of higher
prio than the highest prio task on the target runqueue is found it is pulled
to the target runqueue.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |   2 +
 kernel/sched_rt.c | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 178 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 97cab60..c917971 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3721,6 +3721,8 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	schedule_balance_rt(rq, prev);
+
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 547f858..bacb320 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+	if (!task_running(rq, p) &&
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+		return 1;
+	return 0;
+}
+
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+						     int cpu)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 	struct task_struct *next;
@@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
 	}
 
 	queue = array->queue + idx;
+	BUG_ON(list_empty(queue));
+
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next != rq->curr))
-		return next;
+	if (unlikely(pick_rt_task(rq, next, cpu)))
+		goto out;
 
 	if (queue->next->next != queue) {
 		/* same prio task */
 		next = list_entry(queue->next->next, struct task_struct, run_list);
-		return next;
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
 	}
 
+ retry:
 	/* slower, but more flexible */
 	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-	if (unlikely(idx >= MAX_RT_PRIO)) {
-		WARN_ON(1); /* rt_nr_running was 2 and above! */
+	if (unlikely(idx >= MAX_RT_PRIO))
 		return NULL;
-	}
 
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
+	BUG_ON(list_empty(queue));
+
+	list_for_each_entry(next, queue, run_list) {
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
+	}
+
+	goto retry;
 
+ out:
 	return next;
 }
 
@@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq)
 
 	assert_spin_locked(&this_rq->lock);
 
-	next_task = pick_next_highest_task_rt(this_rq);
+	next_task = pick_next_highest_task_rt(this_rq, -1);
 	if (!next_task)
 		return 0;
 
  retry:
-	if (unlikely(next_task == this_rq->curr))
+	if (unlikely(next_task == this_rq->curr)) {
+		WARN_ON(1);
 		return 0;
+	}
 
 	/*
 	 * It's possible that the next_task slipped in of
@@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq)
 		 * so it is possible that next_task has changed.
 		 * If it has, then try again.
 		 */
-		task = pick_next_highest_task_rt(this_rq);
+		task = pick_next_highest_task_rt(this_rq, -1);
 		if (unlikely(task != next_task) && task && paranoid--) {
 			put_task_struct(next_task);
 			next_task = task;
@@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq)
 		;
 }
 
+static int pull_rt_task(struct rq *this_rq)
+{
+	struct task_struct *next;
+	struct task_struct *p;
+	struct rq *src_rq;
+	cpumask_t *rto_cpumask;
+	int this_cpu = this_rq->cpu;
+	int cpu;
+	int ret = 0;
+
+	assert_spin_locked(&this_rq->lock);
+
+	/*
+	 * If cpusets are used, and we have overlapping
+	 * run queue cpusets, then this algorithm may not catch all.
+	 * This is just the price you pay on trying to keep
+	 * dirtying caches down on large SMP machines.
+	 */
+	if (likely(!rt_overloaded()))
+		return 0;
+
+	next = pick_next_task_rt(this_rq);
+
+	rto_cpumask = rt_overload();
+
+	for_each_cpu_mask(cpu, *rto_cpumask) {
+		if (this_cpu == cpu)
+			continue;
+
+		src_rq = cpu_rq(cpu);
+		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
+			/*
+			 * It is possible that overlapping cpusets
+			 * will miss clearing a non overloaded runqueue.
+			 * Clear it now.
+			 */
+			if (double_lock_balance(this_rq, src_rq)) {
+				/* unlocked our runqueue lock */
+				struct task_struct *old_next = next;
+				next = pick_next_task_rt(this_rq);
+				if (next != old_next)
+					ret = 1;
+			}
+			if (likely(src_rq->rt.rt_nr_running <= 1))
+				/*
+				 * Small chance that this_rq->curr changed
+				 * but it's really harmless here.
+				 */
+				rt_clear_overload(this_rq);
+			else
+				/*
+				 * Heh, the src_rq is now overloaded, since
+				 * we already have the src_rq lock, go straight
+				 * to pulling tasks from it.
+				 */
+				goto try_pulling;
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+		/*
+		 * We can potentially drop this_rq's lock in
+		 * double_lock_balance, and another CPU could
+		 * steal our next task - hence we must cause
+		 * the caller to recalculate the next task
+		 * in that case:
+		 */
+		if (double_lock_balance(this_rq, src_rq)) {
+			struct task_struct *old_next = next;
+			next = pick_next_task_rt(this_rq);
+			if (next != old_next)
+				ret = 1;
+		}
+
+		/*
+		 * Are there still pullable RT tasks?
+		 */
+		if (src_rq->rt.rt_nr_running <= 1) {
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+ try_pulling:
+		p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+		/*
+		 * Do we have an RT task that preempts
+		 * the to-be-scheduled task?
+		 */
+		if (p && (!next || (p->prio < next->prio))) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->se.on_rq);
+
+			/*
+			 * There's a chance that p is higher in priority
+			 * than what's currently running on its cpu.
+			 * This is just that p is wakeing up and hasn't
+			 * had a chance to schedule. We only pull
+			 * p if it is lower in priority than the
+			 * current task on the run queue or
+			 * this_rq next task is lower in prio than
+			 * the current task on that rq.
+			 */
+			if (p->prio < src_rq->curr->prio ||
+			    (next && next->prio < src_rq->curr->prio))
+				goto bail;
+
+			ret = 1;
+
+			deactivate_task(src_rq, p, 0);
+			set_task_cpu(p, this_cpu);
+			activate_task(this_rq, p, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue. (low likelyhood
+			 * but possible)
+			 */
+
+			/*
+			 * Update next so that we won't pick a task
+			 * on another cpu with a priority lower (or equal)
+			 * than the one we just picked.
+			 */
+			next = p;
+
+		}
+ bail:
+		spin_unlock(&src_rq->lock);
+	}
+
+	return ret;
+}
+
+static void schedule_balance_rt(struct rq *rq,
+				struct task_struct *prev)
+{
+	/* Try to pull RT tasks here if we lower this rq's prio */
+	if (unlikely(rt_task(prev)) &&
+	    rq->rt.highest_prio > prev->prio)
+		pull_rt_task(rq);
+}
+
 static void schedule_tail_balance_rt(struct rq *rq)
 {
 	/*
@@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
+# define schedule_balance_rt(rq, prev)	do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
-- 
cgit v1.1


From 4642dafdf93dc7d66ee33437b93a5e6b8cea20d2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:07 +0100
Subject: sched: push RT tasks from overloaded CPUs

This patch adds pushing of overloaded RT tasks from a runqueue that is
having tasks (most likely RT tasks) added to the run queue.

TODO: We don't cover the case of waking of new RT tasks (yet).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  1 +
 kernel/sched_rt.c | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c917971..357d3a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1710,6 +1710,7 @@ out_activate:
 
 out_running:
 	p->state = TASK_RUNNING;
+	wakeup_balance_rt(rq, p);
 out:
 	task_rq_unlock(rq, &flags);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bacb320..d38a8a5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -558,6 +558,15 @@ static void schedule_tail_balance_rt(struct rq *rq)
 	}
 }
 
+
+static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+{
+	if (unlikely(rt_task(p)) &&
+	    !task_running(rq, p) &&
+	    (p->prio >= rq->curr->prio))
+		push_rt_tasks(rq);
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -665,6 +674,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
 # define schedule_balance_rt(rq, prev)	do { } while (0)
+# define wakeup_balance_rt(rq, p)	do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
-- 
cgit v1.1


From c7a1e46aa9782a947cf2ed506245d43396dbf991 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:07 +0100
Subject: sched: disable standard balancer for RT tasks

Since we now take an active approach to load balancing, we don't need to
balance RT tasks via the normal task balancer. In fact, this code was
found to pull RT tasks away from CPUS that the active movement performed,
resulting in large latencies.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 95 +++----------------------------------------------------
 1 file changed, 4 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d38a8a5..c492fd2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -567,109 +567,22 @@ static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
 		push_rt_tasks(rq);
 }
 
-/*
- * Load-balancing iterator. Note: while the runqueue stays locked
- * during the whole iteration, the current task might be
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
- */
-static struct task_struct *load_balance_start_rt(void *arg)
-{
-	struct rq *rq = arg;
-	struct rt_prio_array *array = &rq->rt.active;
-	struct list_head *head, *curr;
-	struct task_struct *p;
-	int idx;
-
-	idx = sched_find_first_bit(array->bitmap);
-	if (idx >= MAX_RT_PRIO)
-		return NULL;
-
-	head = array->queue + idx;
-	curr = head->prev;
-
-	p = list_entry(curr, struct task_struct, run_list);
-
-	curr = curr->prev;
-
-	rq->rt.rt_load_balance_idx = idx;
-	rq->rt.rt_load_balance_head = head;
-	rq->rt.rt_load_balance_curr = curr;
-
-	return p;
-}
-
-static struct task_struct *load_balance_next_rt(void *arg)
-{
-	struct rq *rq = arg;
-	struct rt_prio_array *array = &rq->rt.active;
-	struct list_head *head, *curr;
-	struct task_struct *p;
-	int idx;
-
-	idx = rq->rt.rt_load_balance_idx;
-	head = rq->rt.rt_load_balance_head;
-	curr = rq->rt.rt_load_balance_curr;
-
-	/*
-	 * If we arrived back to the head again then
-	 * iterate to the next queue (if any):
-	 */
-	if (unlikely(head == curr)) {
-		int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-
-		if (next_idx >= MAX_RT_PRIO)
-			return NULL;
-
-		idx = next_idx;
-		head = array->queue + idx;
-		curr = head->prev;
-
-		rq->rt.rt_load_balance_idx = idx;
-		rq->rt.rt_load_balance_head = head;
-	}
-
-	p = list_entry(curr, struct task_struct, run_list);
-
-	curr = curr->prev;
-
-	rq->rt.rt_load_balance_curr = curr;
-
-	return p;
-}
-
 static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
-	struct rq_iterator rt_rq_iterator;
-
-	rt_rq_iterator.start = load_balance_start_rt;
-	rt_rq_iterator.next = load_balance_next_rt;
-	/* pass 'busiest' rq argument into
-	 * load_balance_[start|next]_rt iterators
-	 */
-	rt_rq_iterator.arg = busiest;
-
-	return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
-			     idle, all_pinned, this_best_prio, &rt_rq_iterator);
+	/* don't touch RT tasks */
+	return 0;
 }
 
 static int
 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 struct sched_domain *sd, enum cpu_idle_type idle)
 {
-	struct rq_iterator rt_rq_iterator;
-
-	rt_rq_iterator.start = load_balance_start_rt;
-	rt_rq_iterator.next = load_balance_next_rt;
-	rt_rq_iterator.arg = busiest;
-
-	return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
-				  &rt_rq_iterator);
+	/* don't touch RT tasks */
+	return 0;
 }
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
-- 
cgit v1.1


From 73fe6aae84400e2b475e2a1dc4e8592cd3ed6e69 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:07 +0100
Subject: sched: add RT-balance cpu-weight

Some RT tasks (particularly kthreads) are bound to one specific CPU.
It is fairly common for two or more bound tasks to get queued up at the
same time.  Consider, for instance, softirq_timer and softirq_sched.  A
timer goes off in an ISR which schedules softirq_thread to run at RT50.
Then the timer handler determines that it's time to smp-rebalance the
system so it schedules softirq_sched to run.  So we are in a situation
where we have two RT50 tasks queued, and the system will go into
rt-overload condition to request other CPUs for help.

This causes two problems in the current code:

1) If a high-priority bound task and a low-priority unbounded task queue
   up behind the running task, we will fail to ever relocate the unbounded
   task because we terminate the search on the first unmovable task.

2) We spend precious futile cycles in the fast-path trying to pull
   overloaded tasks over.  It is therefore optimial to strive to avoid the
   overhead all together if we can cheaply detect the condition before
   overload even occurs.

This patch tries to achieve this optimization by utilizing the hamming
weight of the task->cpus_allowed mask.  A weight of 1 indicates that
the task cannot be migrated.  We will then utilize this information to
skip non-migratable tasks and to eliminate uncessary rebalance attempts.

We introduce a per-rq variable to count the number of migratable tasks
that are currently running.  We only go into overload if we have more
than one rt task, AND at least one of them is migratable.

In addition, we introduce a per-task variable to cache the cpus_allowed
weight, since the hamming calculation is probably relatively expensive.
We only update the cached value when the mask is updated which should be
relatively infrequent, especially compared to scheduling frequency
in the fast path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c     |  1 +
 kernel/sched.c    |  9 ++++++++-
 kernel/sched_rt.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 54 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 09c0b90..930c518 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1242,6 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index 357d3a0..66e99b4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -343,6 +343,7 @@ struct rt_rq {
 	int rt_load_balance_idx;
 	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 	unsigned long rt_nr_running;
+	unsigned long rt_nr_migratory;
 	/* highest queued rt task prio */
 	int highest_prio;
 };
@@ -5144,7 +5145,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 		goto out;
 	}
 
-	p->cpus_allowed = new_mask;
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, &new_mask);
+	else {
+		p->cpus_allowed    = new_mask;
+		p->nr_cpus_allowed = cpus_weight(new_mask);
+	}
+
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpu_isset(task_cpu(p), new_mask))
 		goto out;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c492fd2..ae4995c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -33,6 +33,14 @@ static inline void rt_clear_overload(struct rq *rq)
 	atomic_dec(&rto_count);
 	cpu_clear(rq->cpu, rt_overload_mask);
 }
+
+static void update_rt_migration(struct rq *rq)
+{
+	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
+		rt_set_overload(rq);
+	else
+		rt_clear_overload(rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -65,8 +73,10 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 #ifdef CONFIG_SMP
 	if (p->prio < rq->rt.highest_prio)
 		rq->rt.highest_prio = p->prio;
-	if (rq->rt.rt_nr_running > 1)
-		rt_set_overload(rq);
+	if (p->nr_cpus_allowed > 1)
+		rq->rt.rt_nr_migratory++;
+
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 }
 
@@ -88,8 +98,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 		} /* otherwise leave rq->highest prio alone */
 	} else
 		rq->rt.highest_prio = MAX_RT_PRIO;
-	if (rq->rt.rt_nr_running < 2)
-		rt_clear_overload(rq);
+	if (p->nr_cpus_allowed > 1)
+		rq->rt.rt_nr_migratory--;
+
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 }
 
@@ -182,7 +194,8 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+	    (p->nr_cpus_allowed > 1))
 		return 1;
 	return 0;
 }
@@ -584,6 +597,32 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	/* don't touch RT tasks */
 	return 0;
 }
+static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
+{
+	int weight = cpus_weight(*new_mask);
+
+	BUG_ON(!rt_task(p));
+
+	/*
+	 * Update the migration status of the RQ if we have an RT task
+	 * which is running AND changing its weight value.
+	 */
+	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
+		struct rq *rq = task_rq(p);
+
+		if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+			rq->rt.rt_nr_migratory++;
+		else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+			BUG_ON(!rq->rt.rt_nr_migratory);
+			rq->rt.rt_nr_migratory--;
+		}
+
+		update_rt_migration(rq);
+	}
+
+	p->cpus_allowed    = *new_mask;
+	p->nr_cpus_allowed = weight;
+}
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
 # define schedule_balance_rt(rq, prev)	do { } while (0)
@@ -637,6 +676,7 @@ const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
+	.set_cpus_allowed       = set_cpus_allowed_rt,
 #endif
 
 	.set_curr_task          = set_curr_task_rt,
-- 
cgit v1.1


From 697f0a487f294e634a342764472b79375bb3158a Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:09 +0100
Subject: sched: clean up this_rq use in kernel/sched_rt.c

"this_rq" is normally used to denote the RQ on the current cpu
(i.e. "cpu_rq(this_cpu)").  So clean up the usage of this_rq to be
more consistent with the rest of the code.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ae4995c..b788e35 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -328,21 +328,21 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *this_rq)
+static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
 	int ret = 0;
 	int paranoid = RT_MAX_TRIES;
 
-	assert_spin_locked(&this_rq->lock);
+	assert_spin_locked(&rq->lock);
 
-	next_task = pick_next_highest_task_rt(this_rq, -1);
+	next_task = pick_next_highest_task_rt(rq, -1);
 	if (!next_task)
 		return 0;
 
  retry:
-	if (unlikely(next_task == this_rq->curr)) {
+	if (unlikely(next_task == rq->curr)) {
 		WARN_ON(1);
 		return 0;
 	}
@@ -352,24 +352,24 @@ static int push_rt_task(struct rq *this_rq)
 	 * higher priority than current. If that's the case
 	 * just reschedule current.
 	 */
-	if (unlikely(next_task->prio < this_rq->curr->prio)) {
-		resched_task(this_rq->curr);
+	if (unlikely(next_task->prio < rq->curr->prio)) {
+		resched_task(rq->curr);
 		return 0;
 	}
 
-	/* We might release this_rq lock */
+	/* We might release rq lock */
 	get_task_struct(next_task);
 
 	/* find_lock_lowest_rq locks the rq if found */
-	lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+	lowest_rq = find_lock_lowest_rq(next_task, rq);
 	if (!lowest_rq) {
 		struct task_struct *task;
 		/*
-		 * find lock_lowest_rq releases this_rq->lock
+		 * find lock_lowest_rq releases rq->lock
 		 * so it is possible that next_task has changed.
 		 * If it has, then try again.
 		 */
-		task = pick_next_highest_task_rt(this_rq, -1);
+		task = pick_next_highest_task_rt(rq, -1);
 		if (unlikely(task != next_task) && task && paranoid--) {
 			put_task_struct(next_task);
 			next_task = task;
@@ -380,7 +380,7 @@ static int push_rt_task(struct rq *this_rq)
 
 	assert_spin_locked(&lowest_rq->lock);
 
-	deactivate_task(this_rq, next_task, 0);
+	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
 
-- 
cgit v1.1


From e7693a362ec84bb5b6fd441d8a8b4b9d568a7a0c Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:09 +0100
Subject: sched: de-SCHED_OTHER-ize the RT path

The current wake-up code path tries to determine if it can optimize the
wake-up to "this_cpu" by computing load calculations.  The problem is that
these calculations are only relevant to SCHED_OTHER tasks where load is king.
For RT tasks, priority is king.  So the load calculation is completely wasted
bandwidth.

Therefore, we create a new sched_class interface to help with
pre-wakeup routing decisions and move the load calculation as a function
of CFS task's class.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 167 ++++++++----------------------------------------
 kernel/sched_fair.c     | 148 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_idletask.c |   9 +++
 kernel/sched_rt.c       |  10 +++
 4 files changed, 194 insertions(+), 140 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 66e99b4..3344ba7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -960,6 +960,13 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_avg_load_per_task(int cpu);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+#endif /* CONFIG_SMP */
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1118,7 +1125,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 /*
  * Is this task likely cache-hot:
  */
-static inline int
+static int
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;
@@ -1343,7 +1350,7 @@ static unsigned long target_load(int cpu, int type)
 /*
  * Return the average load per task on the cpu's run queue
  */
-static inline unsigned long cpu_avg_load_per_task(int cpu)
+static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
@@ -1500,58 +1507,6 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	cpumask_t tmp;
-	struct sched_domain *sd;
-	int i;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_IDLE) {
-			cpus_and(tmp, sd->span, p->cpus_allowed);
-			for_each_cpu_mask(i, tmp) {
-				if (idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-							se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -1573,8 +1528,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	long old_state;
 	struct rq *rq;
 #ifdef CONFIG_SMP
-	struct sched_domain *sd, *this_sd = NULL;
-	unsigned long load, this_load;
 	int new_cpu;
 #endif
 
@@ -1594,90 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	new_cpu = cpu;
-
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu) {
-		schedstat_inc(rq, ttwu_local);
-		goto out_set_cpu;
-	}
-
-	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
-			schedstat_inc(sd, ttwu_wake_remote);
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
-
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
-		}
-	}
-
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
-	new_cpu = wake_idle(new_cpu, p);
+	new_cpu = p->sched_class->select_task_rq(p, sync);
 	if (new_cpu != cpu) {
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
@@ -1693,6 +1563,23 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(rq, ttwu_count);
+	if (cpu == this_cpu)
+		schedstat_inc(rq, ttwu_local);
+	else {
+		struct sched_domain *sd;
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+
+#endif
+
+
 out_activate:
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c208e0..f881fc5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,6 +861,151 @@ static void yield_task_fair(struct rq *rq)
 }
 
 /*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * Returns the CPU we should wake onto.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	/*
+	 * If it is idle, then it is the best cpu to run this task.
+	 *
+	 * This cpu is also the best, if it has more than one task already.
+	 * Siblings must be also busy(in most cases) as they didn't already
+	 * pickup the extra load from this cpu and hence we need not check
+	 * sibling runqueue info. This will avoid the checks and cache miss
+	 * penalities associated with that.
+	 */
+	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+		return cpu;
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i)) {
+					if (i != task_cpu(p)) {
+						schedstat_inc(p,
+						       se.nr_wakeups_idle);
+					}
+					return i;
+				}
+			}
+		} else {
+			break;
+		}
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+	return cpu;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	int cpu, this_cpu;
+	struct rq *rq;
+	struct sched_domain *sd, *this_sd = NULL;
+	int new_cpu;
+
+	cpu      = task_cpu(p);
+	rq       = task_rq(p);
+	this_cpu = smp_processor_id();
+	new_cpu  = cpu;
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			this_sd = sd;
+			break;
+		}
+	}
+
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	/*
+	 * Check for affine wakeup and passive balancing possibilities.
+	 */
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
+		unsigned long load, this_load;
+
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
+
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
+			unsigned long tl_per_task;
+
+			/*
+			 * Attract cache-cold tasks on sync wakeups:
+			 */
+			if (sync && !task_hot(p, rq->clock, this_sd))
+				goto out_set_cpu;
+
+			schedstat_inc(p, se.nr_wakeups_affine_attempts);
+			tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+			/*
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
+			 */
+			if (sync)
+				tl -= current->se.load.weight;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+			       100*(tl + p->se.load.weight) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
+				 */
+				schedstat_inc(this_sd, ttwu_move_affine);
+				schedstat_inc(p, se.nr_wakeups_affine);
+				goto out_set_cpu;
+			}
+		}
+
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		if (this_sd->flags & SD_WAKE_BALANCE) {
+			if (imbalance*this_load <= 100*load) {
+				schedstat_inc(this_sd, ttwu_move_balance);
+				schedstat_inc(p, se.nr_wakeups_passive);
+				goto out_set_cpu;
+			}
+		}
+	}
+
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+
+
+/*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
@@ -1153,6 +1298,9 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c..ca53748 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
  *  handled in sched_fair.c)
  */
 
+#ifdef CONFIG_SMP
+static int select_task_rq_idle(struct task_struct *p, int sync)
+{
+	return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
  */
@@ -72,6 +78,9 @@ const struct sched_class idle_sched_class = {
 
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_idle,
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b788e35..5de1aeb 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -150,6 +150,13 @@ yield_task_rt(struct rq *rq)
 	requeue_task_rt(rq, rq->curr);
 }
 
+#ifdef CONFIG_SMP
+static int select_task_rq_rt(struct task_struct *p, int sync)
+{
+	return task_cpu(p);
+}
+#endif /* CONFIG_SMP */
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -667,6 +674,9 @@ const struct sched_class rt_sched_class = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_rt,
+#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_curr_rt,
 
-- 
cgit v1.1


From 07b4032c9e505e2a1fbe7703aff64a153c3249be Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:10 +0100
Subject: sched: break out search for RT tasks

Isolate the search logic into a function so that it can be used later
in places other than find_locked_lowest_rq().

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 66 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5de1aeb..ffd0272 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -263,54 +263,66 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 
-/* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
-				      struct rq *this_rq)
+static int find_lowest_rq(struct task_struct *task)
 {
-	struct rq *lowest_rq = NULL;
 	int cpu;
-	int tries;
 	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+	struct rq *lowest_rq = NULL;
 
 	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
 
-	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
-		/*
-		 * Scan each rq for the lowest prio.
-		 */
-		for_each_cpu_mask(cpu, *cpu_mask) {
-			struct rq *rq = &per_cpu(runqueues, cpu);
+	/*
+	 * Scan each rq for the lowest prio.
+	 */
+	for_each_cpu_mask(cpu, *cpu_mask) {
+		struct rq *rq = cpu_rq(cpu);
 
-			if (cpu == this_rq->cpu)
-				continue;
+		if (cpu == rq->cpu)
+			continue;
 
-			/* We look for lowest RT prio or non-rt CPU */
-			if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-				lowest_rq = rq;
-				break;
-			}
+		/* We look for lowest RT prio or non-rt CPU */
+		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+			lowest_rq = rq;
+			break;
+		}
 
-			/* no locking for now */
-			if (rq->rt.highest_prio > task->prio &&
-			    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
-				lowest_rq = rq;
-			}
+		/* no locking for now */
+		if (rq->rt.highest_prio > task->prio &&
+		    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
+			lowest_rq = rq;
 		}
+	}
+
+	return lowest_rq ? lowest_rq->cpu : -1;
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+				      struct rq *rq)
+{
+	struct rq *lowest_rq = NULL;
+	int cpu;
+	int tries;
 
-		if (!lowest_rq)
+	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+		cpu = find_lowest_rq(task);
+
+		if (cpu == -1)
 			break;
 
+		lowest_rq = cpu_rq(cpu);
+
 		/* if the prio of this runqueue changed, try again */
-		if (double_lock_balance(this_rq, lowest_rq)) {
+		if (double_lock_balance(rq, lowest_rq)) {
 			/*
 			 * We had to unlock the run queue. In
 			 * the mean time, task could have
 			 * migrated already or had its affinity changed.
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
-			if (unlikely(task_rq(task) != this_rq ||
+			if (unlikely(task_rq(task) != rq ||
 				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
-				     task_running(this_rq, task) ||
+				     task_running(rq, task) ||
 				     !task->se.on_rq)) {
 				spin_unlock(&lowest_rq->lock);
 				lowest_rq = NULL;
-- 
cgit v1.1


From 2de0b4639f4b1b6bfe31f795e5855f041f177170 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:10 +0100
Subject: sched: RT balancing: include current CPU

It doesn't hurt if we allow the current CPU to be included in the
search.  We will just simply skip it later if the current CPU turns out
to be the lowest.

We will use this later in the series

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ffd0272..95f36f6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -277,9 +277,6 @@ static int find_lowest_rq(struct task_struct *task)
 	for_each_cpu_mask(cpu, *cpu_mask) {
 		struct rq *rq = cpu_rq(cpu);
 
-		if (cpu == rq->cpu)
-			continue;
-
 		/* We look for lowest RT prio or non-rt CPU */
 		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
 			lowest_rq = rq;
@@ -307,7 +304,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
 		cpu = find_lowest_rq(task);
 
-		if (cpu == -1)
+		if ((cpu == -1) || (cpu == rq->cpu))
 			break;
 
 		lowest_rq = cpu_rq(cpu);
-- 
cgit v1.1


From 318e0893ce3f524ca045f9fd9dfd567c0a6f9446 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:10 +0100
Subject: sched: pre-route RT tasks on wakeup

In the original patch series that Steven Rostedt and I worked on together,
we both took different approaches to low-priority wakeup path.  I utilized
"pre-routing" (push the task away to a less important RQ before activating)
approach, while Steve utilized a "post-routing" approach.  The advantage of
my approach is that you avoid the overhead of a wasted activate/deactivate
cycle and peripherally related burdens.  The advantage of Steve's method is
that it neatly solves an issue preventing a "pull" optimization from being
deployed.

In the end, we ended up deploying Steve's idea.  But it later dawned on me
that we could get the best of both worlds by deploying both ideas together,
albeit slightly modified.

The idea is simple:  Use a "light-weight" lookup for pre-routing, since we
only need to approximate a good home for the task.  And we also retain the
post-routing push logic to clean up any inaccuracies caused by a condition
of "priority mistargeting" caused by the lightweight lookup.  Most of the
time, the pre-routing should work and yield lower overhead.  In the cases
where it doesnt, the post-router will bat cleanup.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 95f36f6..ac7d067 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -151,8 +151,27 @@ yield_task_rt(struct rq *rq)
 }
 
 #ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
 static int select_task_rq_rt(struct task_struct *p, int sync)
 {
+	struct rq *rq = task_rq(p);
+
+	/*
+	 * If the task will not preempt the RQ, try to find a better RQ
+	 * before we even activate the task
+	 */
+	if ((p->prio >= rq->rt.highest_prio)
+	    && (p->nr_cpus_allowed > 1)) {
+		int cpu = find_lowest_rq(p);
+
+		return (cpu == -1) ? task_cpu(p) : cpu;
+	}
+
+	/*
+	 * Otherwise, just let it ride on the affined RQ and the
+	 * post-schedule router will push the preempted task away
+	 */
 	return task_cpu(p);
 }
 #endif /* CONFIG_SMP */
-- 
cgit v1.1


From 6e1254d2c41215da27025add8900ed187bca121d Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:11 +0100
Subject: sched: optimize RT affinity

The current code base assumes a relatively flat CPU/core topology and will
route RT tasks to any CPU fairly equally.  In the real world, there are
various toplogies and affinities that govern where a task is best suited to
run with the smallest amount of overhead.  NUMA and multi-core CPUs are
prime examples of topologies that can impact cache performance.

Fortunately, linux is already structured to represent these topologies via
the sched_domains interface.  So we change our RT router to consult a
combination of topology and affinity policy to best place tasks during
migration.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 88 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ac7d067..a9d7d44 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -281,35 +281,111 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 }
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
-static int find_lowest_rq(struct task_struct *task)
+static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
-	int cpu;
-	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
-	struct rq *lowest_rq = NULL;
+	int       cpu;
+	cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
+	int       lowest_prio = -1;
+	int       ret         = 0;
 
-	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+	cpus_clear(*lowest_mask);
+	cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
 
 	/*
 	 * Scan each rq for the lowest prio.
 	 */
-	for_each_cpu_mask(cpu, *cpu_mask) {
+	for_each_cpu_mask(cpu, *valid_mask) {
 		struct rq *rq = cpu_rq(cpu);
 
 		/* We look for lowest RT prio or non-rt CPU */
 		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			lowest_rq = rq;
-			break;
+			if (ret)
+				cpus_clear(*lowest_mask);
+			cpu_set(rq->cpu, *lowest_mask);
+			return 1;
 		}
 
 		/* no locking for now */
-		if (rq->rt.highest_prio > task->prio &&
-		    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
-			lowest_rq = rq;
+		if ((rq->rt.highest_prio > task->prio)
+		    && (rq->rt.highest_prio >= lowest_prio)) {
+			if (rq->rt.highest_prio > lowest_prio) {
+				/* new low - clear old data */
+				lowest_prio = rq->rt.highest_prio;
+				cpus_clear(*lowest_mask);
+			}
+			cpu_set(rq->cpu, *lowest_mask);
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+{
+	int first;
+
+	/* "this_cpu" is cheaper to preempt than a remote processor */
+	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+		return this_cpu;
+
+	first = first_cpu(*mask);
+	if (first != NR_CPUS)
+		return first;
+
+	return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+	struct sched_domain *sd;
+	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+	int this_cpu = smp_processor_id();
+	int cpu      = task_cpu(task);
+
+	if (!find_lowest_cpus(task, lowest_mask))
+		return -1;
+
+	/*
+	 * At this point we have built a mask of cpus representing the
+	 * lowest priority tasks in the system.  Now we want to elect
+	 * the best one based on our affinity and topology.
+	 *
+	 * We prioritize the last cpu that the task executed on since
+	 * it is most likely cache-hot in that location.
+	 */
+	if (cpu_isset(cpu, *lowest_mask))
+		return cpu;
+
+	/*
+	 * Otherwise, we consult the sched_domains span maps to figure
+	 * out which cpu is logically closest to our hot cache data.
+	 */
+	if (this_cpu == cpu)
+		this_cpu = -1; /* Skip this_cpu opt if the same */
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			cpumask_t domain_mask;
+			int       best_cpu;
+
+			cpus_and(domain_mask, sd->span, *lowest_mask);
+
+			best_cpu = pick_optimal_cpu(this_cpu,
+						    &domain_mask);
+			if (best_cpu != -1)
+				return best_cpu;
 		}
 	}
 
-	return lowest_rq ? lowest_rq->cpu : -1;
+	/*
+	 * And finally, if there were no matches within the domains
+	 * just give the caller *something* to work with from the compatible
+	 * locations.
+	 */
+	return pick_optimal_cpu(this_cpu, lowest_mask);
 }
 
 /* Will lock the rq it finds */
-- 
cgit v1.1


From a22d7fc187ed996b66d8439db27b2303f79a8e7b Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:12 +0100
Subject: sched: wake-balance fixes

We have logic to detect whether the system has migratable tasks, but we are
not using it when deciding whether to push tasks away.  So we add support
for considering this new information.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  2 ++
 kernel/sched_rt.c | 10 ++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3344ba7..c591abd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -346,6 +346,7 @@ struct rt_rq {
 	unsigned long rt_nr_migratory;
 	/* highest queued rt task prio */
 	int highest_prio;
+	int overloaded;
 };
 
 /*
@@ -6770,6 +6771,7 @@ void __init sched_init(void)
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq->rt.highest_prio = MAX_RT_PRIO;
+		rq->rt.overloaded = 0;
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a9d7d44..87d7b3f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -16,6 +16,7 @@ static inline cpumask_t *rt_overload(void)
 }
 static inline void rt_set_overload(struct rq *rq)
 {
+	rq->rt.overloaded = 1;
 	cpu_set(rq->cpu, rt_overload_mask);
 	/*
 	 * Make sure the mask is visible before we set
@@ -32,6 +33,7 @@ static inline void rt_clear_overload(struct rq *rq)
 	/* the order here really doesn't matter */
 	atomic_dec(&rto_count);
 	cpu_clear(rq->cpu, rt_overload_mask);
+	rq->rt.overloaded = 0;
 }
 
 static void update_rt_migration(struct rq *rq)
@@ -448,6 +450,9 @@ static int push_rt_task(struct rq *rq)
 
 	assert_spin_locked(&rq->lock);
 
+	if (!rq->rt.overloaded)
+		return 0;
+
 	next_task = pick_next_highest_task_rt(rq, -1);
 	if (!next_task)
 		return 0;
@@ -675,7 +680,7 @@ static void schedule_tail_balance_rt(struct rq *rq)
 	 * the lock was owned by prev, we need to release it
 	 * first via finish_lock_switch and then reaquire it here.
 	 */
-	if (unlikely(rq->rt.rt_nr_running > 1)) {
+	if (unlikely(rq->rt.overloaded)) {
 		spin_lock_irq(&rq->lock);
 		push_rt_tasks(rq);
 		spin_unlock_irq(&rq->lock);
@@ -687,7 +692,8 @@ static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
 {
 	if (unlikely(rt_task(p)) &&
 	    !task_running(rq, p) &&
-	    (p->prio >= rq->curr->prio))
+	    (p->prio >= rq->rt.highest_prio) &&
+	    rq->rt.overloaded)
 		push_rt_tasks(rq);
 }
 
-- 
cgit v1.1


From e1f47d891c0f00769d6d40ac5740f943e998d089 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:12 +0100
Subject: sched: RT-balance, avoid overloading

This patch changes the searching for a run queue by a waking RT task
to try to pick another runqueue if the currently running task
is an RT task.

The reason is that RT tasks behave different than normal
tasks. Preempting a normal task to run a RT task to keep
its cache hot is fine, because the preempted non-RT task
may wait on that same runqueue to run again unless the
migration thread comes along and pulls it off.

RT tasks behave differently. If one is preempted, it makes
an active effort to continue to run. So by having a high
priority task preempt a lower priority RT task, that lower
RT task will then quickly try to run on another runqueue.
This will cause that lower RT task to replace its nice
hot cache (and TLB) with a completely cold one. This is
for the hope that the new high priority RT task will keep
 its cache hot.

Remeber that this high priority RT task was just woken up.
So it may likely have been sleeping for several milliseconds,
and will end up with a cold cache anyway. RT tasks run till
they voluntarily stop, or are preempted by a higher priority
task. This means that it is unlikely that the woken RT task
will have a hot cache to wake up to. So pushing off a lower
RT task is just killing its cache for no good reason.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 87d7b3f..9becc37 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -160,11 +160,23 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 	struct rq *rq = task_rq(p);
 
 	/*
-	 * If the task will not preempt the RQ, try to find a better RQ
-	 * before we even activate the task
+	 * If the current task is an RT task, then
+	 * try to see if we can wake this RT task up on another
+	 * runqueue. Otherwise simply start this RT task
+	 * on its current runqueue.
+	 *
+	 * We want to avoid overloading runqueues. Even if
+	 * the RT task is of higher priority than the current RT task.
+	 * RT tasks behave differently than other tasks. If
+	 * one gets preempted, we try to push it off to another queue.
+	 * So trying to keep a preempting RT task on the same
+	 * cache hot CPU will force the running RT task to
+	 * a cold CPU. So we waste all the cache for the lower
+	 * RT task in hopes of saving some of a RT task
+	 * that is just being woken and probably will have
+	 * cold cache anyway.
 	 */
-	if ((p->prio >= rq->rt.highest_prio)
-	    && (p->nr_cpus_allowed > 1)) {
+	if (unlikely(rt_task(rq->curr))) {
 		int cpu = find_lowest_rq(p);
 
 		return (cpu == -1) ? task_cpu(p) : cpu;
-- 
cgit v1.1


From 17b3279b48835eb522d842eae16f541da3729c8a Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:13 +0100
Subject: sched: break out early if RT task cannot be migrated

We don't need to bother searching if the task cannot be migrated

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9becc37..72c8132 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -176,7 +176,8 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 	 * that is just being woken and probably will have
 	 * cold cache anyway.
 	 */
-	if (unlikely(rt_task(rq->curr))) {
+	if (unlikely(rt_task(rq->curr)) &&
+	    (p->nr_cpus_allowed > 1)) {
 		int cpu = find_lowest_rq(p);
 
 		return (cpu == -1) ? task_cpu(p) : cpu;
-- 
cgit v1.1


From 06f90dbd7610d51549004ea9c2ada337831eb292 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:13 +0100
Subject: sched: RT-balance, optimize

We can cheaply track the number of bits set in the cpumask for the lowest
priority CPUs.  Therefore, compute the mask's weight and use it to skip
the optimal domain search logic when there is only one CPU available.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 72c8132..52d88f1 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -303,7 +303,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 	int       cpu;
 	cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
 	int       lowest_prio = -1;
-	int       ret         = 0;
+	int       count       = 0;
 
 	cpus_clear(*lowest_mask);
 	cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
@@ -316,7 +316,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 
 		/* We look for lowest RT prio or non-rt CPU */
 		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			if (ret)
+			if (count)
 				cpus_clear(*lowest_mask);
 			cpu_set(rq->cpu, *lowest_mask);
 			return 1;
@@ -328,14 +328,17 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 			if (rq->rt.highest_prio > lowest_prio) {
 				/* new low - clear old data */
 				lowest_prio = rq->rt.highest_prio;
-				cpus_clear(*lowest_mask);
+				if (count) {
+					cpus_clear(*lowest_mask);
+					count = 0;
+				}
 			}
 			cpu_set(rq->cpu, *lowest_mask);
-			ret = 1;
+			count++;
 		}
 	}
 
-	return ret;
+	return count;
 }
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
@@ -359,9 +362,17 @@ static int find_lowest_rq(struct task_struct *task)
 	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
+	int count    = find_lowest_cpus(task, lowest_mask);
 
-	if (!find_lowest_cpus(task, lowest_mask))
-		return -1;
+	if (!count)
+		return -1; /* No targets found */
+
+	/*
+	 * There is no sense in performing an optimal search if only one
+	 * target is found.
+	 */
+	if (count == 1)
+		return first_cpu(*lowest_mask);
 
 	/*
 	 * At this point we have built a mask of cpus representing the
-- 
cgit v1.1


From 610bf05645a7ac6ea104a474e328eeaaea148870 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:13 +0100
Subject: sched: RT-balance, optimize cpu search

This patch removes several cpumask operations by keeping track
of the first of the CPUS that is of the lowest priority. When
the search for the lowest priority runqueue is completed, all
the bits up to the first CPU with the lowest priority runqueue
is cleared.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 52d88f1..61d1988 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -296,29 +296,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 }
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
 static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
-	int       cpu;
-	cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
 	int       lowest_prio = -1;
+	int       lowest_cpu  = -1;
 	int       count       = 0;
+	int       cpu;
 
-	cpus_clear(*lowest_mask);
-	cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
+	cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed);
 
 	/*
 	 * Scan each rq for the lowest prio.
 	 */
-	for_each_cpu_mask(cpu, *valid_mask) {
+	for_each_cpu_mask(cpu, *lowest_mask) {
 		struct rq *rq = cpu_rq(cpu);
 
 		/* We look for lowest RT prio or non-rt CPU */
 		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			if (count)
+			/*
+			 * if we already found a low RT queue
+			 * and now we found this non-rt queue
+			 * clear the mask and set our bit.
+			 * Otherwise just return the queue as is
+			 * and the count==1 will cause the algorithm
+			 * to use the first bit found.
+			 */
+			if (lowest_cpu != -1) {
 				cpus_clear(*lowest_mask);
-			cpu_set(rq->cpu, *lowest_mask);
+				cpu_set(rq->cpu, *lowest_mask);
+			}
 			return 1;
 		}
 
@@ -328,13 +335,29 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 			if (rq->rt.highest_prio > lowest_prio) {
 				/* new low - clear old data */
 				lowest_prio = rq->rt.highest_prio;
-				if (count) {
-					cpus_clear(*lowest_mask);
-					count = 0;
-				}
+				lowest_cpu = cpu;
+				count = 0;
 			}
-			cpu_set(rq->cpu, *lowest_mask);
 			count++;
+		} else
+			cpu_clear(cpu, *lowest_mask);
+	}
+
+	/*
+	 * Clear out all the set bits that represent
+	 * runqueues that were of higher prio than
+	 * the lowest_prio.
+	 */
+	if (lowest_cpu > 0) {
+		/*
+		 * Perhaps we could add another cpumask op to
+		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+		 * Then that could be optimized to use memset and such.
+		 */
+		for_each_cpu_mask(cpu, *lowest_mask) {
+			if (cpu >= lowest_cpu)
+				break;
+			cpu_clear(cpu, *lowest_mask);
 		}
 	}
 
-- 
cgit v1.1


From 0d1311a536a0face6267e7346223f2e68b002018 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:14 +0100
Subject: sched: RT-balance on new task

rt-balance when creating new tasks.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c591abd..36bd8ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1722,6 +1722,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
+	wakeup_balance_rt(rq, p);
 	task_rq_unlock(rq, &flags);
 }
 
-- 
cgit v1.1


From 79064fbf75796c4c6a53e40729dbe52f789a91fd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:14 +0100
Subject: sched: clean up pick_next_highest_task_rt()

clean up pick_next_highest_task_rt().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 61d1988..b8435fd 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -242,8 +242,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 }
 
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
-						     int cpu)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 	struct task_struct *next;
@@ -270,7 +269,8 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 
 	if (queue->next->next != queue) {
 		/* same prio task */
-		next = list_entry(queue->next->next, struct task_struct, run_list);
+		next = list_entry(queue->next->next, struct task_struct,
+				  run_list);
 		if (pick_rt_task(rq, next, cpu))
 			goto out;
 	}
-- 
cgit v1.1


From 4df64c0bfb7e0e260d10ebc005f7d0ba1308eed7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:15 +0100
Subject: sched: clean up find_lock_lowest_rq()

clean up find_lock_lowest_rq().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b8435fd..0749c18 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -438,12 +438,11 @@ static int find_lowest_rq(struct task_struct *task)
 }
 
 /* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
-				      struct rq *rq)
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 {
 	struct rq *lowest_rq = NULL;
-	int cpu;
 	int tries;
+	int cpu;
 
 	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
 		cpu = find_lowest_rq(task);
@@ -462,9 +461,11 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+				     !cpu_isset(lowest_rq->cpu,
+						task->cpus_allowed) ||
 				     task_running(rq, task) ||
 				     !task->se.on_rq)) {
+
 				spin_unlock(&lowest_rq->lock);
 				lowest_rq = NULL;
 				break;
-- 
cgit v1.1


From deeeccd41bd94a9db133d7b923f9a7479a47305d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:15 +0100
Subject: sched: clean up overlong line in kernel/sched_debug.c

clean up overlong line in kernel/sched_debug.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0749c18..b591b89 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -762,6 +762,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	/* don't touch RT tasks */
 	return 0;
 }
+
 static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 {
 	int weight = cpus_weight(*new_mask);
@@ -775,9 +776,9 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
-		if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+		if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
 			rq->rt.rt_nr_migratory++;
-		else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+		} else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
 			BUG_ON(!rq->rt.rt_nr_migratory);
 			rq->rt.rt_nr_migratory--;
 		}
@@ -788,6 +789,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 	p->cpus_allowed    = *new_mask;
 	p->nr_cpus_allowed = weight;
 }
+
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
 # define schedule_balance_rt(rq, prev)	do { } while (0)
-- 
cgit v1.1


From 84de4274893691aa8c471a1f7336d51e555d23a0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:15 +0100
Subject: sched: clean up kernel/sched_rt.c

clean up whitespace damage and missing comments in kernel/sched_rt.c.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b591b89..1a2d8f0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -4,16 +4,24 @@
  */
 
 #ifdef CONFIG_SMP
+
+/*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
 static cpumask_t rt_overload_mask;
 static atomic_t rto_count;
+
 static inline int rt_overloaded(void)
 {
 	return atomic_read(&rto_count);
 }
+
 static inline cpumask_t *rt_overload(void)
 {
 	return &rt_overload_mask;
 }
+
 static inline void rt_set_overload(struct rq *rq)
 {
 	rq->rt.overloaded = 1;
@@ -28,6 +36,7 @@ static inline void rt_set_overload(struct rq *rq)
 	wmb();
 	atomic_inc(&rto_count);
 }
+
 static inline void rt_clear_overload(struct rq *rq)
 {
 	/* the order here really doesn't matter */
-- 
cgit v1.1


From 6e1938d3ad58c940ec4119d387dd92a787cb238c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:16 +0100
Subject: sched: remove rt_overload()

remove rt_overload() - it's an unnecessary indirection.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1a2d8f0..deff0c7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -17,11 +17,6 @@ static inline int rt_overloaded(void)
 	return atomic_read(&rto_count);
 }
 
-static inline cpumask_t *rt_overload(void)
-{
-	return &rt_overload_mask;
-}
-
 static inline void rt_set_overload(struct rq *rq)
 {
 	rq->rt.overloaded = 1;
@@ -590,7 +585,6 @@ static int pull_rt_task(struct rq *this_rq)
 	struct task_struct *next;
 	struct task_struct *p;
 	struct rq *src_rq;
-	cpumask_t *rto_cpumask;
 	int this_cpu = this_rq->cpu;
 	int cpu;
 	int ret = 0;
@@ -608,9 +602,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 	next = pick_next_task_rt(this_rq);
 
-	rto_cpumask = rt_overload();
-
-	for_each_cpu_mask(cpu, *rto_cpumask) {
+	for_each_cpu_mask(cpu, rt_overload_mask) {
 		if (this_cpu == cpu)
 			continue;
 
-- 
cgit v1.1


From 00597c3ed78e424bdafff123565c078d8b6088cf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:16 +0100
Subject: sched: remove leftover debugging

remove leftover debugging.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index deff0c7..cc38521 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -253,8 +253,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 	struct list_head *queue;
 	int idx;
 
-	assert_spin_locked(&rq->lock);
-
 	if (likely(rq->rt.rt_nr_running < 2))
 		return NULL;
 
@@ -500,8 +498,6 @@ static int push_rt_task(struct rq *rq)
 	int ret = 0;
 	int paranoid = RT_MAX_TRIES;
 
-	assert_spin_locked(&rq->lock);
-
 	if (!rq->rt.overloaded)
 		return 0;
 
@@ -546,8 +542,6 @@ static int push_rt_task(struct rq *rq)
 		goto out;
 	}
 
-	assert_spin_locked(&lowest_rq->lock);
-
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
@@ -589,8 +583,6 @@ static int pull_rt_task(struct rq *this_rq)
 	int cpu;
 	int ret = 0;
 
-	assert_spin_locked(&this_rq->lock);
-
 	/*
 	 * If cpusets are used, and we have overlapping
 	 * run queue cpusets, then this algorithm may not catch all.
-- 
cgit v1.1


From 80bf3171dcdf0f5d236e2e48afe2a95c7ce23879 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:17 +0100
Subject: sched: clean up pull_rt_task()

clean up pull_rt_task().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cc38521..05ada7d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -576,12 +576,9 @@ static void push_rt_tasks(struct rq *rq)
 
 static int pull_rt_task(struct rq *this_rq)
 {
-	struct task_struct *next;
-	struct task_struct *p;
+	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	struct task_struct *p, *next;
 	struct rq *src_rq;
-	int this_cpu = this_rq->cpu;
-	int cpu;
-	int ret = 0;
 
 	/*
 	 * If cpusets are used, and we have overlapping
@@ -608,23 +605,25 @@ static int pull_rt_task(struct rq *this_rq)
 			if (double_lock_balance(this_rq, src_rq)) {
 				/* unlocked our runqueue lock */
 				struct task_struct *old_next = next;
+
 				next = pick_next_task_rt(this_rq);
 				if (next != old_next)
 					ret = 1;
 			}
-			if (likely(src_rq->rt.rt_nr_running <= 1))
+			if (likely(src_rq->rt.rt_nr_running <= 1)) {
 				/*
 				 * Small chance that this_rq->curr changed
 				 * but it's really harmless here.
 				 */
 				rt_clear_overload(this_rq);
-			else
+			} else {
 				/*
 				 * Heh, the src_rq is now overloaded, since
 				 * we already have the src_rq lock, go straight
 				 * to pulling tasks from it.
 				 */
 				goto try_pulling;
+			}
 			spin_unlock(&src_rq->lock);
 			continue;
 		}
@@ -638,6 +637,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 */
 		if (double_lock_balance(this_rq, src_rq)) {
 			struct task_struct *old_next = next;
+
 			next = pick_next_task_rt(this_rq);
 			if (next != old_next)
 				ret = 1;
@@ -674,7 +674,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 */
 			if (p->prio < src_rq->curr->prio ||
 			    (next && next->prio < src_rq->curr->prio))
-				goto bail;
+				goto out;
 
 			ret = 1;
 
@@ -686,9 +686,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 */
-
-			/*
+			 *
 			 * Update next so that we won't pick a task
 			 * on another cpu with a priority lower (or equal)
 			 * than the one we just picked.
@@ -696,7 +694,7 @@ static int pull_rt_task(struct rq *this_rq)
 			next = p;
 
 		}
- bail:
+ out:
 		spin_unlock(&src_rq->lock);
 	}
 
-- 
cgit v1.1


From 7f51f298204ec0528422cd9b23feac12612c5665 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:17 +0100
Subject: sched: clean up schedule_balance_rt()

clean up schedule_balance_rt().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 05ada7d..4d0a60e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -701,12 +701,10 @@ static int pull_rt_task(struct rq *this_rq)
 	return ret;
 }
 
-static void schedule_balance_rt(struct rq *rq,
-				struct task_struct *prev)
+static void schedule_balance_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) &&
-	    rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
 		pull_rt_task(rq);
 }
 
-- 
cgit v1.1


From 57d885fea0da0e9541d7730a9e1dcf734981a173 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:18 +0100
Subject: sched: add sched-domain roots

We add the notion of a root-domain which will be used later to rescope
global variables to per-domain variables.  Each exclusive cpuset
essentially defines an island domain by fully partitioning the member cpus
from any other cpuset.  However, we currently still maintain some
policy/state as global variables which transcend all cpusets.  Consider,
for instance, rt-overload state.

Whenever a new exclusive cpuset is created, we also create a new
root-domain object and move each cpu member to the root-domain's span.
By default the system creates a single root-domain with all cpus as
members (mimicking the global state we have today).

We add some plumbing for storing class specific data in our root-domain.
Whenever a RQ is switching root-domains (because of repartitioning) we
give each sched_class the opportunity to remove any state from its old
domain and add state to the new one.  This logic doesn't have any clients
yet but it will later in the series.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Christoph Lameter <clameter@sgi.com>
CC: Paul Jackson <pj@sgi.com>
CC: Simon Derr <simon.derr@bull.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 36bd8ff..34b7d72 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -349,6 +349,28 @@ struct rt_rq {
 	int overloaded;
 };
 
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables.  Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset.  Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain {
+	atomic_t refcount;
+	cpumask_t span;
+	cpumask_t online;
+};
+
+static struct root_domain def_root_domain;
+
+#endif
+
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -406,6 +428,7 @@ struct rq {
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
+	struct root_domain  *rd;
 	struct sched_domain *sd;
 
 	/* For active balancing */
@@ -5550,6 +5573,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE_FROZEN:
 		/* Strictly unnecessary, as first user will wake it. */
 		wake_up_process(cpu_rq(cpu)->migration_thread);
+
+		/* Update our root-domain */
+		rq = cpu_rq(cpu);
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->rd) {
+			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			cpu_set(cpu, rq->rd->online);
+		}
+		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5600,6 +5632,17 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		}
 		spin_unlock_irq(&rq->lock);
 		break;
+
+	case CPU_DOWN_PREPARE:
+		/* Update our root-domain */
+		rq = cpu_rq(cpu);
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->rd) {
+			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			cpu_clear(cpu, rq->rd->online);
+		}
+		spin_unlock_irqrestore(&rq->lock, flags);
+		break;
 #endif
 	}
 	return NOTIFY_OK;
@@ -5788,11 +5831,69 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 }
 
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	unsigned long flags;
+	const struct sched_class *class;
+
+	spin_lock_irqsave(&rq->lock, flags);
+
+	if (rq->rd) {
+		struct root_domain *old_rd = rq->rd;
+
+		for (class = sched_class_highest; class; class = class->next)
+			if (class->leave_domain)
+				class->leave_domain(rq);
+
+		if (atomic_dec_and_test(&old_rd->refcount))
+			kfree(old_rd);
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	for (class = sched_class_highest; class; class = class->next)
+		if (class->join_domain)
+			class->join_domain(rq);
+
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void init_rootdomain(struct root_domain *rd, const cpumask_t *map)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	rd->span = *map;
+	cpus_and(rd->online, rd->span, cpu_online_map);
+}
+
+static void init_defrootdomain(void)
+{
+	cpumask_t cpus = CPU_MASK_ALL;
+
+	init_rootdomain(&def_root_domain, &cpus);
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(const cpumask_t *map)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	init_rootdomain(rd, map);
+
+	return rd;
+}
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd,
+			      struct root_domain *rd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -5817,6 +5918,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 
 	sched_domain_debug(sd, cpu);
 
+	rq_attach_root(rq, rd);
 	rcu_assign_pointer(rq->sd, sd);
 }
 
@@ -6185,6 +6287,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct root_domain *rd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	int sd_allnodes = 0;
@@ -6201,6 +6304,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
 
+	rd = alloc_rootdomain(cpu_map);
+	if (!rd) {
+		printk(KERN_WARNING "Cannot alloc root domain\n");
+		return -ENOMEM;
+	}
+
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
@@ -6417,7 +6526,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #else
 		sd = &per_cpu(phys_domains, i);
 #endif
-		cpu_attach_domain(sd, i);
+		cpu_attach_domain(sd, rd, i);
 	}
 
 	return 0;
@@ -6475,7 +6584,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	unregister_sched_domain_sysctl();
 
 	for_each_cpu_mask(i, *cpu_map)
-		cpu_attach_domain(NULL, i);
+		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map);
 }
@@ -6727,6 +6836,10 @@ void __init sched_init(void)
 	int highest_cpu = 0;
 	int i, j;
 
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rt_prio_array *array;
 		struct rq *rq;
@@ -6765,6 +6878,8 @@ void __init sched_init(void)
 			rq->cpu_load[j] = 0;
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
+		rq->rd = NULL;
+		rq_attach_root(rq, &def_root_domain);
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
-- 
cgit v1.1


From 637f50851b57a32f7ec67c50fc16f1601ab1a87a Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:18 +0100
Subject: sched: only balance our RT tasks within our domain

We move the rt-overload data as the first global to per-domain
reclassification.  This limits the scope of overload related cache-line
bouncing to stay with a specified partition instead of affecting all
cpus in the system.

Finally, we limit the scope of find_lowest_cpu searches to the domain
instead of the entire system.  Note that we would always respect domain
boundaries even without this patch, but we first would scan potentially
all cpus before whittling the list down.  Now we can avoid looking at
RQs that are out of scope, again reducing cache-line hits.

Note: In some cases, task->cpus_allowed will effectively reduce our search
to within our domain.  However, I believe there are cases where the
cpus_allowed mask may be all ones and therefore we err on the side of
caution.  If it can be optimized later, so be it.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  7 +++++++
 kernel/sched_rt.c | 57 ++++++++++++++++++++++++++++++-------------------------
 2 files changed, 38 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 34b7d72..35ef06c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,13 @@ struct root_domain {
 	atomic_t refcount;
 	cpumask_t span;
 	cpumask_t online;
+
+        /*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_t rto_mask;
+	atomic_t  rto_count;
 };
 
 static struct root_domain def_root_domain;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4d0a60e..b049e51 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -5,22 +5,14 @@
 
 #ifdef CONFIG_SMP
 
-/*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- */
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-
-static inline int rt_overloaded(void)
+static inline int rt_overloaded(struct rq *rq)
 {
-	return atomic_read(&rto_count);
+	return atomic_read(&rq->rd->rto_count);
 }
 
 static inline void rt_set_overload(struct rq *rq)
 {
-	rq->rt.overloaded = 1;
-	cpu_set(rq->cpu, rt_overload_mask);
+	cpu_set(rq->cpu, rq->rd->rto_mask);
 	/*
 	 * Make sure the mask is visible before we set
 	 * the overload count. That is checked to determine
@@ -29,23 +21,25 @@ static inline void rt_set_overload(struct rq *rq)
 	 * updated yet.
 	 */
 	wmb();
-	atomic_inc(&rto_count);
+	atomic_inc(&rq->rd->rto_count);
 }
 
 static inline void rt_clear_overload(struct rq *rq)
 {
 	/* the order here really doesn't matter */
-	atomic_dec(&rto_count);
-	cpu_clear(rq->cpu, rt_overload_mask);
-	rq->rt.overloaded = 0;
+	atomic_dec(&rq->rd->rto_count);
+	cpu_clear(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
 {
-	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
+	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
 		rt_set_overload(rq);
-	else
+		rq->rt.overloaded = 1;
+	} else {
 		rt_clear_overload(rq);
+		rq->rt.overloaded = 0;
+	}
 }
 #endif /* CONFIG_SMP */
 
@@ -306,7 +300,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 	int       count       = 0;
 	int       cpu;
 
-	cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed);
+	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 
 	/*
 	 * Scan each rq for the lowest prio.
@@ -580,18 +574,12 @@ static int pull_rt_task(struct rq *this_rq)
 	struct task_struct *p, *next;
 	struct rq *src_rq;
 
-	/*
-	 * If cpusets are used, and we have overlapping
-	 * run queue cpusets, then this algorithm may not catch all.
-	 * This is just the price you pay on trying to keep
-	 * dirtying caches down on large SMP machines.
-	 */
-	if (likely(!rt_overloaded()))
+	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
 	next = pick_next_task_rt(this_rq);
 
-	for_each_cpu_mask(cpu, rt_overload_mask) {
+	for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
 
@@ -811,6 +799,20 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 	}
 }
 
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_clear_overload(rq);
+}
+
 static void set_curr_task_rt(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
@@ -840,4 +842,7 @@ const struct sched_class rt_sched_class = {
 
 	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
+
+	.join_domain            = join_domain_rt,
+	.leave_domain           = leave_domain_rt,
 };
-- 
cgit v1.1


From bdd7c81b4973e72b670eff6b5725bab189b723d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:18 +0100
Subject: sched: fix sched_rt.c:join/leave_domain

fix build bug in sched_rt.c:join/leave_domain and make them only
be included on SMP builds.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b049e51..3ea0cae 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -767,6 +767,20 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 	p->nr_cpus_allowed = weight;
 }
 
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_clear_overload(rq);
+}
+
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
 # define schedule_balance_rt(rq, prev)	do { } while (0)
@@ -799,20 +813,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 	}
 }
 
-/* Assumes rq->lock is held */
-static void join_domain_rt(struct rq *rq)
-{
-	if (rq->rt.overloaded)
-		rt_set_overload(rq);
-}
-
-/* Assumes rq->lock is held */
-static void leave_domain_rt(struct rq *rq)
-{
-	if (rq->rt.overloaded)
-		rt_clear_overload(rq);
-}
-
 static void set_curr_task_rt(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
@@ -838,11 +838,10 @@ const struct sched_class rt_sched_class = {
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed       = set_cpus_allowed_rt,
+	.join_domain            = join_domain_rt,
+	.leave_domain           = leave_domain_rt,
 #endif
 
 	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
-
-	.join_domain            = join_domain_rt,
-	.leave_domain           = leave_domain_rt,
 };
-- 
cgit v1.1


From d7876a08db50895ed9808ede4a259cccf65eba47 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:19 +0100
Subject: sched: remove unused JIFFIES_TO_NS() macro

remove unused JIFFIES_TO_NS() macro.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 35ef06c..461ee90 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -96,10 +96,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
 /*
- * Some helpers for converting nanosecond timing to jiffy resolution
+ * Helpers for converting nanosecond timing to jiffy resolution
  */
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
-- 
cgit v1.1


From 0eab9146571dfa9b50ea952ec2ab27d591f26b63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:19 +0100
Subject: sched: style cleanup, #2

style cleanup of various changes that were done recently.

no code changed:

      text    data     bss     dec     hex filename
     26399    2578      48   29025    7161 sched.o.before
     26399    2578      48   29025    7161 sched.o.after

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 461ee90..23b9925 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -235,17 +235,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares);
  *	Every task in system belong to this group at bootup.
  */
 struct task_group init_task_group = {
-	.se     = init_sched_entity_p,
+	.se	= init_sched_entity_p,
 	.cfs_rq = init_cfs_rq_p,
 };
 
 #ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GROUP_LOAD	2*NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES       2
+#define MIN_GROUP_SHARES	2
 
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
@@ -352,8 +352,8 @@ struct rt_rq {
 
 /*
  * We add the notion of a root-domain which will be used to define per-domain
- * variables.  Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset.  Whenever a new
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
  * exclusive cpuset is created, we also create and attach a new root-domain
  * object.
  *
@@ -365,12 +365,12 @@ struct root_domain {
 	cpumask_t span;
 	cpumask_t online;
 
-        /*
+	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
 	 * one runnable RT task.
 	 */
 	cpumask_t rto_mask;
-	atomic_t  rto_count;
+	atomic_t rto_count;
 };
 
 static struct root_domain def_root_domain;
@@ -434,7 +434,7 @@ struct rq {
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
-	struct root_domain  *rd;
+	struct root_domain *rd;
 	struct sched_domain *sd;
 
 	/* For active balancing */
@@ -5066,7 +5066,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, &new_mask);
 	else {
-		p->cpus_allowed    = new_mask;
+		p->cpus_allowed = new_mask;
 		p->nr_cpus_allowed = cpus_weight(new_mask);
 	}
 
@@ -5847,9 +5847,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	if (rq->rd) {
 		struct root_domain *old_rd = rq->rd;
 
-		for (class = sched_class_highest; class; class = class->next)
+		for (class = sched_class_highest; class; class = class->next) {
 			if (class->leave_domain)
 				class->leave_domain(rq);
+		}
 
 		if (atomic_dec_and_test(&old_rd->refcount))
 			kfree(old_rd);
@@ -5858,9 +5859,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	atomic_inc(&rd->refcount);
 	rq->rd = rd;
 
-	for (class = sched_class_highest; class; class = class->next)
+	for (class = sched_class_highest; class; class = class->next) {
 		if (class->join_domain)
 			class->join_domain(rq);
+	}
 
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -5895,11 +5897,11 @@ static struct root_domain *alloc_rootdomain(const cpumask_t *map)
 }
 
 /*
- * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-static void cpu_attach_domain(struct sched_domain *sd,
-			      struct root_domain *rd, int cpu)
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -7095,7 +7097,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu)
 		for_each_cpu_mask(i, sdspan)
 			total_load += tg->cfs_rq[i]->load.weight;
 
-		/* Nothing to do if this group has no load  */
+		/* Nothing to do if this group has no load */
 		if (!total_load)
 			continue;
 
-- 
cgit v1.1


From b913176917399e92e6f741672038c73d7ce93be5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:19 +0100
Subject: sched: add credits for RT balancing improvements

add credits for RT balancing improvements.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 23b9925..55c5217 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
  *              by Peter Williams
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *              Thomas Gleixner, Mike Kravetz
  */
 
 #include <linux/mm.h>
-- 
cgit v1.1


From 9ec3b77e11b9398ab40b492c4fde7d8aac04a718 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Fri, 25 Jan 2008 21:08:21 +0100
Subject: sched: no need for 'affine wakeup' balancing

No need to do a check for 'affine wakeup and passive balancing possibilities'
in select_task_rq_fair() when task_cpu(p) == this_cpu.

I guess, this part got missed upon introduction of per-sched_class
select_task_rq() in try_to_wake_up().

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f881fc5..2208692 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -925,6 +925,9 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	this_cpu = smp_processor_id();
 	new_cpu  = cpu;
 
+	if (cpu == this_cpu)
+		goto out_set_cpu;
+
 	for_each_domain(this_cpu, sd) {
 		if (cpu_isset(cpu, sd->span)) {
 			this_sd = sd;
-- 
cgit v1.1


From 5d2f5a616d65e3c08acde3195694c4ab8afbc1b7 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Fri, 25 Jan 2008 21:08:21 +0100
Subject: sched: get rid of 'new_cpu' in try_to_wake_up()

Clean-up try_to_wake_up().

Get rid of the 'new_cpu' variable in try_to_wake_up() [ that's, one
#ifdef section less ].  Also remove a few redundant blank lines.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 55c5217..9d6fb73 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1559,9 +1559,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	unsigned long flags;
 	long old_state;
 	struct rq *rq;
-#ifdef CONFIG_SMP
-	int new_cpu;
-#endif
 
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
@@ -1579,9 +1576,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	new_cpu = p->sched_class->select_task_rq(p, sync);
-	if (new_cpu != cpu) {
-		set_task_cpu(p, new_cpu);
+	cpu = p->sched_class->select_task_rq(p, sync);
+	if (cpu != orig_cpu) {
+		set_task_cpu(p, cpu);
 		task_rq_unlock(rq, &flags);
 		/* might preempt at this point */
 		rq = task_rq_lock(p, &flags);
@@ -1608,10 +1605,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			}
 		}
 	}
-
 #endif
 
-
 out_activate:
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
-- 
cgit v1.1


From 4bf0b77158d581c952af237aec79d0604b78fe27 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:21 +0100
Subject: sched: remove do_div() from __sched_slice()

Yanmin Zhang noticed a nice optimization:

  p = l * nr / nl, nl = l/g -> p = g * nr

which eliminates a do_div() from __sched_period().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2208692..10aa6e1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -248,8 +248,8 @@ static u64 __sched_period(unsigned long nr_running)
 	unsigned long nr_latency = sched_nr_latency;
 
 	if (unlikely(nr_running > nr_latency)) {
+		period = sysctl_sched_min_granularity;
 		period *= nr_running;
-		do_div(period, nr_latency);
 	}
 
 	return period;
-- 
cgit v1.1


From 9a897c5a6701bcb6f099f7ca20194999102729fd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 25 Jan 2008 21:08:22 +0100
Subject: sched: RT-balance, replace hooks with pre/post schedule and wakeup
 methods

To make the main sched.c code more agnostic to the schedule classes.
Instead of having specific hooks in the schedule code for the RT class
balancing. They are replaced with a pre_schedule, post_schedule
and task_wake_up methods. These methods may be used by any of the classes
but currently, only the sched_rt class implements them.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 20 ++++++++++++++++----
 kernel/sched_rt.c | 17 +++++++----------
 2 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9d6fb73..2368a0d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1625,7 +1625,10 @@ out_activate:
 
 out_running:
 	p->state = TASK_RUNNING;
-	wakeup_balance_rt(rq, p);
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_wake_up)
+		p->sched_class->task_wake_up(rq, p);
+#endif
 out:
 	task_rq_unlock(rq, &flags);
 
@@ -1748,7 +1751,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
-	wakeup_balance_rt(rq, p);
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_wake_up)
+		p->sched_class->task_wake_up(rq, p);
+#endif
 	task_rq_unlock(rq, &flags);
 }
 
@@ -1869,7 +1875,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
-	schedule_tail_balance_rt(rq);
+#ifdef CONFIG_SMP
+	if (current->sched_class->post_schedule)
+		current->sched_class->post_schedule(rq);
+#endif
 
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -3638,7 +3647,10 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
-	schedule_balance_rt(rq, prev);
+#ifdef CONFIG_SMP
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+#endif
 
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3ea0cae..a5a4510 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -689,14 +689,14 @@ static int pull_rt_task(struct rq *this_rq)
 	return ret;
 }
 
-static void schedule_balance_rt(struct rq *rq, struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
 	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
 		pull_rt_task(rq);
 }
 
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
 {
 	/*
 	 * If we have more than one rt_task queued, then
@@ -713,10 +713,9 @@ static void schedule_tail_balance_rt(struct rq *rq)
 }
 
 
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
-	if (unlikely(rt_task(p)) &&
-	    !task_running(rq, p) &&
+	if (!task_running(rq, p) &&
 	    (p->prio >= rq->rt.highest_prio) &&
 	    rq->rt.overloaded)
 		push_rt_tasks(rq);
@@ -780,11 +779,6 @@ static void leave_domain_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
 }
-
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq)	do { } while (0)
-# define schedule_balance_rt(rq, prev)	do { } while (0)
-# define wakeup_balance_rt(rq, p)	do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
@@ -840,6 +834,9 @@ const struct sched_class rt_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.join_domain            = join_domain_rt,
 	.leave_domain           = leave_domain_rt,
+	.pre_schedule		= pre_schedule_rt,
+	.post_schedule		= post_schedule_rt,
+	.task_wake_up		= task_wake_up_rt,
 #endif
 
 	.set_curr_task          = set_curr_task_rt,
-- 
cgit v1.1


From cb46984504048db946cd551c261df4e70d59a8ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 25 Jan 2008 21:08:22 +0100
Subject: sched: RT-balance, add new methods to sched_class

Dmitry Adamushko found that the current implementation of the RT
balancing code left out changes to the sched_setscheduler and
rt_mutex_setprio.

This patch addresses this issue by adding methods to the schedule classes
to handle being switched out of (switched_from) and being switched into
(switched_to) a sched_class. Also a method for changing of priorities
is also added (prio_changed).

This patch also removes some duplicate logic between rt_mutex_setprio and
sched_setscheduler.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 42 +++++++++++------------
 kernel/sched_fair.c     | 39 ++++++++++++++++++++++
 kernel/sched_idletask.c | 31 +++++++++++++++++
 kernel/sched_rt.c       | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2368a0d..5834c7f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1152,6 +1152,18 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif
 }
 
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+				       const struct sched_class *prev_class,
+				       int oldprio, int running)
+{
+	if (prev_class != p->sched_class) {
+		if (prev_class->switched_from)
+			prev_class->switched_from(rq, p, running);
+		p->sched_class->switched_to(rq, p, running);
+	} else
+		p->sched_class->prio_changed(rq, p, oldprio, running);
+}
+
 #ifdef CONFIG_SMP
 
 /*
@@ -4017,6 +4029,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	unsigned long flags;
 	int oldprio, on_rq, running;
 	struct rq *rq;
+	const struct sched_class *prev_class = p->sched_class;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
@@ -4042,18 +4055,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (on_rq) {
 		if (running)
 			p->sched_class->set_curr_task(rq);
+
 		enqueue_task(rq, p, 0);
-		/*
-		 * Reschedule if we are currently running on this runqueue and
-		 * our priority decreased, or if we are not currently running on
-		 * this runqueue and our priority is higher than the current's
-		 */
-		if (running) {
-			if (p->prio > oldprio)
-				resched_task(rq->curr);
-		} else {
-			check_preempt_curr(rq, p);
-		}
+
+		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -4253,6 +4258,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
+	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
 
 	/* may grab non-irq protected spin_locks */
@@ -4346,18 +4352,10 @@ recheck:
 	if (on_rq) {
 		if (running)
 			p->sched_class->set_curr_task(rq);
+
 		activate_task(rq, p, 0);
-		/*
-		 * Reschedule if we are currently running on this runqueue and
-		 * our priority decreased, or if we are not currently running on
-		 * this runqueue and our priority is higher than the current's
-		 */
-		if (running) {
-			if (p->prio > oldprio)
-				resched_task(rq->curr);
-		} else {
-			check_preempt_curr(rq, p);
-		}
+
+		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 10aa6e1..dfa18d5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1280,6 +1280,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	resched_task(rq->curr);
 }
 
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void prio_changed_fair(struct rq *rq, struct task_struct *p,
+			      int oldprio, int running)
+{
+	/*
+	 * Reschedule if we are currently running on this runqueue and
+	 * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current's
+	 */
+	if (running) {
+		if (p->prio > oldprio)
+			resched_task(rq->curr);
+	} else
+		check_preempt_curr(rq, p);
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	/*
+	 * We were most likely switched from sched_rt, so
+	 * kick off the schedule if running, otherwise just see
+	 * if we can still preempt the current task.
+	 */
+	if (running)
+		resched_task(rq->curr);
+	else
+		check_preempt_curr(rq, p);
+}
+
 /* Account for a task changing its policy or group.
  *
  * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1318,6 +1354,9 @@ static const struct sched_class fair_sched_class = {
 	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_new		= task_new_fair,
+
+	.prio_changed		= prio_changed_fair,
+	.switched_to		= switched_to_fair,
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index ca53748..ef7a266 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -69,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
+static void switched_to_idle(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	/* Can this actually happen?? */
+	if (running)
+		resched_task(rq->curr);
+	else
+		check_preempt_curr(rq, p);
+}
+
+static void prio_changed_idle(struct rq *rq, struct task_struct *p,
+			      int oldprio, int running)
+{
+	/* This can happen for hot plug CPUS */
+
+	/*
+	 * Reschedule if we are currently running on this runqueue and
+	 * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current's
+	 */
+	if (running) {
+		if (p->prio > oldprio)
+			resched_task(rq->curr);
+	} else
+		check_preempt_curr(rq, p);
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
@@ -94,5 +121,9 @@ const struct sched_class idle_sched_class = {
 
 	.set_curr_task          = set_curr_task_idle,
 	.task_tick		= task_tick_idle,
+
+	.prio_changed		= prio_changed_idle,
+	.switched_to		= switched_to_idle,
+
 	/* no .task_new for idle tasks */
 };
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a5a4510..57fa3d9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -779,7 +779,92 @@ static void leave_domain_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
 }
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+			   int running)
+{
+	/*
+	 * If there are other RT tasks then we will reschedule
+	 * and the scheduling of the other RT tasks will handle
+	 * the balancing. But if we are the last RT task
+	 * we may need to handle the pulling of RT tasks
+	 * now.
+	 */
+	if (!rq->rt.rt_nr_running)
+		pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+			   int running)
+{
+	int check_resched = 1;
+
+	/*
+	 * If we are already running, then there's nothing
+	 * that needs to be done. But if we are not running
+	 * we may need to preempt the current running task.
+	 * If that current running task is also an RT task
+	 * then see if we can move to another run queue.
+	 */
+	if (!running) {
+#ifdef CONFIG_SMP
+		if (rq->rt.overloaded && push_rt_task(rq) &&
+		    /* Don't resched if we changed runqueues */
+		    rq != task_rq(p))
+			check_resched = 0;
+#endif /* CONFIG_SMP */
+		if (check_resched && p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+	}
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+			    int oldprio, int running)
+{
+	if (running) {
+#ifdef CONFIG_SMP
+		/*
+		 * If our priority decreases while running, we
+		 * may need to pull tasks to this runqueue.
+		 */
+		if (oldprio < p->prio)
+			pull_rt_task(rq);
+		/*
+		 * If there's a higher priority task waiting to run
+		 * then reschedule.
+		 */
+		if (p->prio > rq->rt.highest_prio)
+			resched_task(p);
+#else
+		/* For UP simply resched on drop of prio */
+		if (oldprio < p->prio)
+			resched_task(p);
 #endif /* CONFIG_SMP */
+	} else {
+		/*
+		 * This task is not running, but if it is
+		 * greater than the current running task
+		 * then reschedule.
+		 */
+		if (p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+	}
+}
+
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
@@ -837,8 +922,12 @@ const struct sched_class rt_sched_class = {
 	.pre_schedule		= pre_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
+	.switched_from		= switched_from_rt,
 #endif
 
 	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
+
+	.prio_changed		= prio_changed_rt,
+	.switched_to		= switched_to_rt,
 };
-- 
cgit v1.1


From cdc8eb984ce47a7c90a049f45229f7b0d59ba781 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:23 +0100
Subject: sched: RT-balance, only adjust overload state when changing

The overload set/clears were originally idempotent when this logic was first
implemented.  But that is no longer true due to the addition of the atomic
counter and this logic was never updated to work properly with that change.
So only adjust the overload state if it is actually changing to avoid
getting out of sync.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 57fa3d9..a386758 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -34,9 +34,11 @@ static inline void rt_clear_overload(struct rq *rq)
 static void update_rt_migration(struct rq *rq)
 {
 	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
-		rt_set_overload(rq);
-		rq->rt.overloaded = 1;
-	} else {
+		if (!rq->rt.overloaded) {
+			rt_set_overload(rq);
+			rq->rt.overloaded = 1;
+		}
+	} else if (rq->rt.overloaded) {
 		rt_clear_overload(rq);
 		rq->rt.overloaded = 0;
 	}
-- 
cgit v1.1


From c49443c538c1bbf50eda27e4a3711e9fc15176b0 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:23 +0100
Subject: sched: remove some old cpuset logic

We had support for overlapping cpuset based rto logic in early
prototypes that is no longer used, so remove it.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a386758..9affb3c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -586,38 +586,6 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
-		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
-			/*
-			 * It is possible that overlapping cpusets
-			 * will miss clearing a non overloaded runqueue.
-			 * Clear it now.
-			 */
-			if (double_lock_balance(this_rq, src_rq)) {
-				/* unlocked our runqueue lock */
-				struct task_struct *old_next = next;
-
-				next = pick_next_task_rt(this_rq);
-				if (next != old_next)
-					ret = 1;
-			}
-			if (likely(src_rq->rt.rt_nr_running <= 1)) {
-				/*
-				 * Small chance that this_rq->curr changed
-				 * but it's really harmless here.
-				 */
-				rt_clear_overload(this_rq);
-			} else {
-				/*
-				 * Heh, the src_rq is now overloaded, since
-				 * we already have the src_rq lock, go straight
-				 * to pulling tasks from it.
-				 */
-				goto try_pulling;
-			}
-			spin_unlock(&src_rq->lock);
-			continue;
-		}
-
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
@@ -641,7 +609,6 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 		}
 
- try_pulling:
 		p = pick_next_highest_task_rt(src_rq, this_cpu);
 
 		/*
-- 
cgit v1.1


From c2d727aa2ff17a1c8e5ed1e5e231bb8579b27e82 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 25 Jan 2008 21:08:23 +0100
Subject: Preempt-RCU: Use softirq instead of tasklets for

This patch makes RCU use softirq instead of tasklets.

It also adds a memory barrier after raising the softirq
inorder to ensure that the cpu sees the most recently updated
value of rcu->cur while processing callbacks.
The discussion of the related theoretical race pointed out
by James Huang can be found here --> http://lkml.org/lkml/2007/11/20/603

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f2c1a04..4dfa0b7 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,8 +73,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
-/* Fake initialization required by compiler */
-static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
 static int blimit = 10;
 static int qhimark = 10000;
 static int qlowmark = 100;
@@ -231,6 +229,18 @@ void rcu_barrier(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
+/* Raises the softirq for processing rcu_callbacks. */
+static inline void raise_rcu_softirq(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+	/*
+	 * The smp_mb() here is required to ensure that this cpu's
+	 * __rcu_process_callbacks() reads the most recently updated
+	 * value of rcu->cur.
+	 */
+	smp_mb();
+}
+
 /*
  * Invoke the completed RCU callbacks. They are expected to be in
  * a per-cpu list.
@@ -260,7 +270,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
-		tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
+		raise_rcu_softirq();
 }
 
 /*
@@ -412,7 +422,6 @@ static void rcu_offline_cpu(int cpu)
 					&per_cpu(rcu_bh_data, cpu));
 	put_cpu_var(rcu_data);
 	put_cpu_var(rcu_bh_data);
-	tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
 }
 
 #else
@@ -424,7 +433,7 @@ static void rcu_offline_cpu(int cpu)
 #endif
 
 /*
- * This does the RCU processing work from tasklet context. 
+ * This does the RCU processing work from softirq context.
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
@@ -469,7 +478,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		rcu_do_batch(rdp);
 }
 
-static void rcu_process_callbacks(unsigned long unused)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
@@ -533,7 +542,7 @@ void rcu_check_callbacks(int cpu, int user)
 		rcu_bh_qsctr_inc(cpu);
 	} else if (!in_softirq())
 		rcu_bh_qsctr_inc(cpu);
-	tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
+	raise_rcu_softirq();
 }
 
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
@@ -556,7 +565,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
 
 	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
 	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
 }
 
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-- 
cgit v1.1


From 01c1c660f4b8086cad7a62345fd04290f3d82c8f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:24 +0100
Subject: Preempt-RCU: reorganize RCU code into rcuclassic.c and rcupdate.c

This patch re-organizes the RCU code to enable multiple implementations
of RCU. Users of RCU continues to include rcupdate.h and the
RCU interfaces remain the same. This is in preparation for
subsequently merging the preemptible RCU implementation.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile     |   2 +-
 kernel/rcuclassic.c | 576 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcupdate.c   | 575 +++------------------------------------------------
 3 files changed, 602 insertions(+), 551 deletions(-)
 create mode 100644 kernel/rcuclassic.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index dfa9695..def5dd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,7 +6,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
-	    rcupdate.o extable.o params.o posix-timers.o \
+	    rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
 	    utsname.o notifier.o
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 0000000..18369e3
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,576 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+/* #include <linux/rcupdate.h> @@@ */
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
+	.cpumask = CPU_MASK_NONE,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
+	.cpumask = CPU_MASK_NONE,
+};
+
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(!rcp->signaled)) {
+		rcp->signaled = 1;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void call_rcu(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_data);
+	*rdp->nxttail = head;
+	rdp->nxttail = &head->next;
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/**
+ * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by rcu_read_lock() and
+ * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
+ * and rcu_read_unlock_bh(), if in process context. These may be nested.
+ */
+void call_rcu_bh(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_bh_data);
+	*rdp->nxttail = head;
+	rdp->nxttail = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/* Raises the softirq for processing rcu_callbacks. */
+static inline void raise_rcu_softirq(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+	/*
+	 * The smp_mb() here is required to ensure that this cpu's
+	 * __rcu_process_callbacks() reads the most recently updated
+	 * value of rcu->cur.
+	 */
+	smp_mb();
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	struct rcu_head *next, *list;
+	int count = 0;
+
+	list = rdp->donelist;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+	rdp->donelist = list;
+
+	local_irq_disable();
+	rdp->qlen -= count;
+	local_irq_enable();
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	if (!rdp->donelist)
+		rdp->donetail = &rdp->donelist;
+	else
+		raise_rcu_softirq();
+}
+
+/*
+ * Grace period handling:
+ * The grace period handling consists out of two steps:
+ * - A new grace period is started.
+ *   This is done by rcu_start_batch. The start is not broadcasted to
+ *   all cpus, they must pick this up by comparing rcp->cur with
+ *   rdp->quiescbatch. All cpus are recorded  in the
+ *   rcu_ctrlblk.cpumask bitmap.
+ * - All cpus must go through a quiescent state.
+ *   Since the start of the grace period is not broadcasted, at least two
+ *   calls to rcu_check_quiescent_state are required:
+ *   The first call just notices that a new grace period is running. The
+ *   following calls check if there was a quiescent state since the beginning
+ *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
+ *   the bitmap is empty, then the grace period is completed.
+ *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
+ *   period (if necessary).
+ */
+/*
+ * Register a new batch of callbacks, and start it up if there is currently no
+ * active batch and the batch to be registered has not already occurred.
+ * Caller must hold rcu_ctrlblk.lock.
+ */
+static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+{
+	if (rcp->next_pending &&
+			rcp->completed == rcp->cur) {
+		rcp->next_pending = 0;
+		/*
+		 * next_pending == 0 must be visible in
+		 * __rcu_process_callbacks() before it can see new value of cur.
+		 */
+		smp_wmb();
+		rcp->cur++;
+
+		/*
+		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
+		 * Barrier  Otherwise it can cause tickless idle CPUs to be
+		 * included in rcp->cpumask, which will extend graceperiods
+		 * unnecessarily.
+		 */
+		smp_mb();
+		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+
+		rcp->signaled = 0;
+	}
+}
+
+/*
+ * cpu went through a quiescent state since the beginning of the grace period.
+ * Clear it from the cpu mask and complete the grace period if it was the last
+ * cpu. Start another grace period if someone has further entries pending
+ */
+static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+{
+	cpu_clear(cpu, rcp->cpumask);
+	if (cpus_empty(rcp->cpumask)) {
+		/* batch completed ! */
+		rcp->completed = rcp->cur;
+		rcu_start_batch(rcp);
+	}
+}
+
+/*
+ * Check if the cpu has gone through a quiescent state (say context
+ * switch). If so and if it already hasn't done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ */
+static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
+					struct rcu_data *rdp)
+{
+	if (rdp->quiescbatch != rcp->cur) {
+		/* start new grace period: */
+		rdp->qs_pending = 1;
+		rdp->passed_quiesc = 0;
+		rdp->quiescbatch = rcp->cur;
+		return;
+	}
+
+	/* Grace period already completed for this cpu?
+	 * qs_pending is checked instead of the actual bitmap to avoid
+	 * cacheline trashing.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+	rdp->qs_pending = 0;
+
+	spin_lock(&rcp->lock);
+	/*
+	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
+	 * during cpu startup. Ignore the quiescent state.
+	 */
+	if (likely(rdp->quiescbatch == rcp->cur))
+		cpu_quiet(rdp->cpu, rcp);
+
+	spin_unlock(&rcp->lock);
+}
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
+ * locking requirements, the list it's pulling from has to belong to a cpu
+ * which is dead and hence not processing interrupts.
+ */
+static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
+				struct rcu_head **tail)
+{
+	local_irq_disable();
+	*this_rdp->nxttail = list;
+	if (list)
+		this_rdp->nxttail = tail;
+	local_irq_enable();
+}
+
+static void __rcu_offline_cpu(struct rcu_data *this_rdp,
+				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+	/* if the cpu going offline owns the grace period
+	 * we can block indefinitely waiting for it, so flush
+	 * it here
+	 */
+	spin_lock_bh(&rcp->lock);
+	if (rcp->cur != rcp->completed)
+		cpu_quiet(rdp->cpu, rcp);
+	spin_unlock_bh(&rcp->lock);
+	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
+	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
+}
+
+static void rcu_offline_cpu(int cpu)
+{
+	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
+	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+
+	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
+					&per_cpu(rcu_data, cpu));
+	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
+					&per_cpu(rcu_bh_data, cpu));
+	put_cpu_var(rcu_data);
+	put_cpu_var(rcu_bh_data);
+}
+
+#else
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif
+
+/*
+ * This does the RCU processing work from softirq context.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
+					struct rcu_data *rdp)
+{
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
+		*rdp->donetail = rdp->curlist;
+		rdp->donetail = rdp->curtail;
+		rdp->curlist = NULL;
+		rdp->curtail = &rdp->curlist;
+	}
+
+	if (rdp->nxtlist && !rdp->curlist) {
+		local_irq_disable();
+		rdp->curlist = rdp->nxtlist;
+		rdp->curtail = rdp->nxttail;
+		rdp->nxtlist = NULL;
+		rdp->nxttail = &rdp->nxtlist;
+		local_irq_enable();
+
+		/*
+		 * start the next batch of callbacks
+		 */
+
+		/* determine batch number */
+		rdp->batch = rcp->cur + 1;
+		/* see the comment and corresponding wmb() in
+		 * the rcu_start_batch()
+		 */
+		smp_rmb();
+
+		if (!rcp->next_pending) {
+			/* and start it/schedule start if it's a new batch */
+			spin_lock(&rcp->lock);
+			rcp->next_pending = 1;
+			rcu_start_batch(rcp);
+			spin_unlock(&rcp->lock);
+		}
+	}
+
+	rcu_check_quiescent_state(rcp, rdp);
+	if (rdp->donelist)
+		rcu_do_batch(rdp);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+}
+
+static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+	/* This cpu has pending rcu entries and the grace period
+	 * for them has completed.
+	 */
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
+		return 1;
+
+	/* This cpu has no pending entries, but there are new entries */
+	if (!rdp->curlist && rdp->nxtlist)
+		return 1;
+
+	/* This cpu has finished callbacks to invoke */
+	if (rdp->donelist)
+		return 1;
+
+	/* The rcu core waits for a quiescent state from the cpu */
+	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
+		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
+
+	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+	} else if (!in_softirq())
+		rcu_bh_qsctr_inc(cpu);
+	raise_rcu_softirq();
+}
+
+static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+						struct rcu_data *rdp)
+{
+	memset(rdp, 0, sizeof(*rdp));
+	rdp->curtail = &rdp->curlist;
+	rdp->nxttail = &rdp->nxtlist;
+	rdp->donetail = &rdp->donelist;
+	rdp->quiescbatch = rcp->completed;
+	rdp->qs_pending = 0;
+	rdp->cpu = cpu;
+	rdp->blimit = blimit;
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+
+	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
+	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+/*
+ * Initializes rcu mechanism.  Assumed to be called early.
+ * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
+ * Note that rcu_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
+ */
+void __init __rcu_init(void)
+{
+	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4dfa0b7..0ccd009 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -15,7 +15,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright (C) IBM Corporation, 2001
+ * Copyright IBM Corporation, 2001
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
@@ -35,163 +35,57 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
-#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
-#include <linux/module.h>
 #include <linux/completion.h>
-#include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/module.h>
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
 };
 
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
-
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
-
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	cpumask_t cpumask;
-	set_need_resched();
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
-		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 */
-		cpumask = rcp->cpumask;
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask(cpu, cpumask)
-			smp_send_reschedule(cpu);
-	}
-}
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+/* Because of FASTCALL declaration of complete, we use this wrapper */
+static void wakeme_after_rcu(struct rcu_head  *head)
 {
-	set_need_resched();
+	struct rcu_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_synchronize, head);
+	complete(&rcu->completion);
 }
-#endif
 
 /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * synchronize_rcu - wait until a grace period has elapsed.
  *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
  * read-side critical sections have completed.  RCU read-side critical
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
-void fastcall call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
-	local_irq_restore(flags);
-}
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void fastcall call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+void synchronize_rcu(void)
 {
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_bh_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
-	}
-
-	local_irq_restore(flags);
-}
+	struct rcu_synchronize rcu;
 
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
-	return rcu_ctrlblk.completed;
-}
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished */
+	call_rcu(&rcu.head, wakeme_after_rcu);
 
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed_bh(void)
-{
-	return rcu_bh_ctrlblk.completed;
+	/* Wait for it */
+	wait_for_completion(&rcu.completion);
 }
+EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
@@ -205,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 static void rcu_barrier_func(void *notused)
 {
 	int cpu = smp_processor_id();
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_head *head;
+	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
 
-	head = &rdp->barrier;
 	atomic_inc(&rcu_barrier_cpu_count);
 	call_rcu(head, rcu_barrier_callback);
 }
@@ -229,425 +121,8 @@ void rcu_barrier(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
-/* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
-{
-	raise_softirq(RCU_SOFTIRQ);
-	/*
-	 * The smp_mb() here is required to ensure that this cpu's
-	 * __rcu_process_callbacks() reads the most recently updated
-	 * value of rcu->cur.
-	 */
-	smp_mb();
-}
-
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
-	struct rcu_head *next, *list;
-	int count = 0;
-
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
-	}
-	rdp->donelist = list;
-
-	local_irq_disable();
-	rdp->qlen -= count;
-	local_irq_enable();
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
-
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
-/*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
- */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
-{
-	if (rcp->next_pending &&
-			rcp->completed == rcp->cur) {
-		rcp->next_pending = 0;
-		/*
-		 * next_pending == 0 must be visible in
-		 * __rcu_process_callbacks() before it can see new value of cur.
-		 */
-		smp_wmb();
-		rcp->cur++;
-
-		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
-		 */
-		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
-
-		rcp->signaled = 0;
-	}
-}
-
-/*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
- */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
-{
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
-	}
-}
-
-/*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
- */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
-		return;
-	}
-
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
-	 */
-	if (!rdp->qs_pending)
-		return;
-
-	/* 
-	 * Was there a quiescent state since the beginning of the grace
-	 * period? If no, then exit and wait for the next call.
-	 */
-	if (!rdp->passed_quiesc)
-		return;
-	rdp->qs_pending = 0;
-
-	spin_lock(&rcp->lock);
-	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
-	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock(&rcp->lock);
-}
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail)
-{
-	local_irq_disable();
-	*this_rdp->nxttail = list;
-	if (list)
-		this_rdp->nxttail = tail;
-	local_irq_enable();
-}
-
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	/* if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
-	 */
-	spin_lock_bh(&rcp->lock);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	spin_unlock_bh(&rcp->lock);
-	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
-}
-
-static void rcu_offline_cpu(int cpu)
-{
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
-	put_cpu_var(rcu_data);
-	put_cpu_var(rcu_bh_data);
-}
-
-#else
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif
-
-/*
- * This does the RCU processing work from softirq context.
- */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
-		*rdp->donetail = rdp->curlist;
-		rdp->donetail = rdp->curtail;
-		rdp->curlist = NULL;
-		rdp->curtail = &rdp->curlist;
-	}
-
-	if (rdp->nxtlist && !rdp->curlist) {
-		local_irq_disable();
-		rdp->curlist = rdp->nxtlist;
-		rdp->curtail = rdp->nxttail;
-		rdp->nxtlist = NULL;
-		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
-
-		/*
-		 * start the next batch of callbacks
-		 */
-
-		/* determine batch number */
-		rdp->batch = rcp->cur + 1;
-		/* see the comment and corresponding wmb() in
-		 * the rcu_start_batch()
-		 */
-		smp_rmb();
-
-		if (!rcp->next_pending) {
-			/* and start it/schedule start if it's a new batch */
-			spin_lock(&rcp->lock);
-			rcp->next_pending = 1;
-			rcu_start_batch(rcp);
-			spin_unlock(&rcp->lock);
-		}
-	}
-
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-}
-
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	/* This cpu has pending rcu entries and the grace period
-	 * for them has completed.
-	 */
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-		return 1;
-
-	/* This cpu has no pending entries, but there are new entries */
-	if (!rdp->curlist && rdp->nxtlist)
-		return 1;
-
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
-		return 1;
-
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
-		return 1;
-
-	/* nothing to do */
-	return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so.  This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-int rcu_pending(int cpu)
-{
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-
-	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
-}
-
-void rcu_check_callbacks(int cpu, int user)
-{
-	if (user || 
-	    (idle_cpu(cpu) && !in_softirq() && 
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-		rcu_qsctr_inc(cpu);
-		rcu_bh_qsctr_inc(cpu);
-	} else if (!in_softirq())
-		rcu_bh_qsctr_inc(cpu);
-	raise_rcu_softirq();
-}
-
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
-{
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->curtail = &rdp->curlist;
-	rdp->nxttail = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
-	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
-	rdp->blimit = blimit;
-}
-
-static void __cpuinit rcu_online_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		rcu_offline_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
-	.notifier_call	= rcu_cpu_notify,
-};
-
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
 void __init rcu_init(void)
 {
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	/* Register notifier for non-boot CPUs */
-	register_cpu_notifier(&rcu_nb);
-}
-
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-/* Because of FASTCALL declaration of complete, we use this wrapper */
-static void wakeme_after_rcu(struct rcu_head  *head)
-{
-	struct rcu_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_synchronize, head);
-	complete(&rcu->completion);
-}
-
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- *
- * If your read-side code is not protected by rcu_read_lock(), do -not-
- * use synchronize_rcu().
- */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-
-	/* Wait for it */
-	wait_for_completion(&rcu.completion);
+	__rcu_init();
 }
 
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-EXPORT_SYMBOL_GPL(call_rcu);
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-- 
cgit v1.1


From e0ecfa7917cafe72f4a75f87e8bb5d8d51dc534f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:24 +0100
Subject: Preempt-RCU: fix rcu_barrier for preemptive environment.

Fix rcu_barrier() to work properly in preemptive kernel environment.
Also, the ordering of callback must be preserved while moving
callbacks to another CPU during CPU hotplug.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c |  2 +-
 kernel/rcupdate.c   | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 18369e3..ce0cf16 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -371,9 +371,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
 	spin_unlock_bh(&rcp->lock);
+	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
 	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
 }
 
 static void rcu_offline_cpu(int cpu)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0ccd009..760dfc2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -115,7 +115,17 @@ void rcu_barrier(void)
 	mutex_lock(&rcu_barrier_mutex);
 	init_completion(&rcu_barrier_completion);
 	atomic_set(&rcu_barrier_cpu_count, 0);
+	/*
+	 * The queueing of callbacks in all CPUs must be atomic with
+	 * respect to RCU, otherwise one CPU may queue a callback,
+	 * wait for a grace period, decrement barrier count and call
+	 * complete(), while other CPUs have not yet queued anything.
+	 * So, we need to make sure that grace periods cannot complete
+	 * until all the callbacks are queued.
+	 */
+	rcu_read_lock();
 	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	rcu_read_unlock();
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
 }
-- 
cgit v1.1


From e260be673a15b6125068270e0216a3bfbfc12f87 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:24 +0100
Subject: Preempt-RCU: implementation

This patch implements a new version of RCU which allows its read-side
critical sections to be preempted. It uses a set of counter pairs
to keep track of the read-side critical sections and flips them
when all tasks exit read-side critical section. The details
of this implementation can be found in this paper -

	http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf

and the article-

	http://lwn.net/Articles/253651/

This patch was developed as a part of the -rt kernel development and
meant to provide better latencies when read-side critical sections of
RCU don't disable preemption.  As a consequence of keeping track of RCU
readers, the readers have a slight overhead (optimizations in the paper).
This implementation co-exists with the "classic" RCU implementations
and can be switched to at compiler.

Also includes RCU tracing summarized in debugfs.

[ akpm@linux-foundation.org: build fixes on non-preempt architectures ]

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.preempt    |  10 +
 kernel/Makefile           |   7 +-
 kernel/fork.c             |   4 +
 kernel/rcuclassic.c       |   1 -
 kernel/rcupreempt.c       | 816 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcupreempt_trace.c | 330 +++++++++++++++++++
 6 files changed, 1166 insertions(+), 2 deletions(-)
 create mode 100644 kernel/rcupreempt.c
 create mode 100644 kernel/rcupreempt_trace.c

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c..61fa116 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -63,3 +63,13 @@ config PREEMPT_BKL
 	  Say Y here if you are building a kernel for a desktop system.
 	  Say N if you are unsure.
 
+config RCU_TRACE
+	bool "Enable tracing for RCU - currently stats in debugfs"
+	select DEBUG_FS
+	default y
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.
+
+	  Say Y here if you want to enable RCU tracing
+	  Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index def5dd6..68755cd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,7 +6,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
-	    rcupdate.o rcuclassic.o extable.o params.o posix-timers.o \
+	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
 	    utsname.o notifier.o
@@ -52,6 +52,11 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+ifeq ($(CONFIG_PREEMPT_RCU),y)
+obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
+endif
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 930c518..9f8ef32 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+#ifdef CONFIG_PREEMPT_RCU
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_flipctr_idx = 0;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ce0cf16..f4ffbd0 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -45,7 +45,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-/* #include <linux/rcupdate.h> @@@ */
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 0000000..a5aabb1
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,816 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *		With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
+ *		for pushing me away from locks and towards counters, and
+ *		to Suparna Bhattacharya for pushing me completely away
+ *		from atomic instructions on the read side.
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * Design Document: http://lwn.net/Articles/253651/
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/cpumask.h>
+#include <linux/rcupreempt_trace.h>
+
+/*
+ * Macro that prevents the compiler from reordering accesses, but does
+ * absolutely -nothing- to prevent CPUs from reordering.  This is used
+ * only to mediate communication between mainline code and hardware
+ * interrupt and NMI handlers.
+ */
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+/*
+ * GP_STAGES specifies the number of times the state machine has
+ * to go through the all the rcu_try_flip_states (see below)
+ * in a single Grace Period.
+ *
+ * GP in GP_STAGES stands for Grace Period ;)
+ */
+#define GP_STAGES    2
+struct rcu_data {
+	spinlock_t	lock;		/* Protect rcu_data fields. */
+	long		completed;	/* Number of last completed batch. */
+	int		waitlistcount;
+	struct tasklet_struct rcu_tasklet;
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	struct rcu_head *waitlist[GP_STAGES];
+	struct rcu_head **waittail[GP_STAGES];
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+	long rcu_flipctr[2];
+#ifdef CONFIG_RCU_TRACE
+	struct rcupreempt_trace trace;
+#endif /* #ifdef CONFIG_RCU_TRACE */
+};
+
+/*
+ * States for rcu_try_flip() and friends.
+ */
+
+enum rcu_try_flip_states {
+
+	/*
+	 * Stay here if nothing is happening. Flip the counter if somthing
+	 * starts happening. Denoted by "I"
+	 */
+	rcu_try_flip_idle_state,
+
+	/*
+	 * Wait here for all CPUs to notice that the counter has flipped. This
+	 * prevents the old set of counters from ever being incremented once
+	 * we leave this state, which in turn is necessary because we cannot
+	 * test any individual counter for zero -- we can only check the sum.
+	 * Denoted by "A".
+	 */
+	rcu_try_flip_waitack_state,
+
+	/*
+	 * Wait here for the sum of the old per-CPU counters to reach zero.
+	 * Denoted by "Z".
+	 */
+	rcu_try_flip_waitzero_state,
+
+	/*
+	 * Wait here for each of the other CPUs to execute a memory barrier.
+	 * This is necessary to ensure that these other CPUs really have
+	 * completed executing their RCU read-side critical sections, despite
+	 * their CPUs wildly reordering memory. Denoted by "M".
+	 */
+	rcu_try_flip_waitmb_state,
+};
+
+struct rcu_ctrlblk {
+	spinlock_t	fliplock;	/* Protect state-machine transitions. */
+	long		completed;	/* Number of last completed batch. */
+	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
+							the rcu state machine */
+};
+
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
+	.completed = 0,
+	.rcu_try_flip_state = rcu_try_flip_idle_state,
+};
+
+
+#ifdef CONFIG_RCU_TRACE
+static char *rcu_try_flip_state_names[] =
+	{ "idle", "waitack", "waitzero", "waitmb" };
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has seen
+ * the most recent counter flip.
+ */
+
+enum rcu_flip_flag_values {
+	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
+				/* Only GP detector can update. */
+	rcu_flipped		/* Flip just completed, need confirmation. */
+				/* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
+								= rcu_flip_seen;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has executed the
+ * needed memory barrier to fence in memory references from its last RCU
+ * read-side critical section in the just-completed grace period.
+ */
+
+enum rcu_mb_flag_values {
+	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
+				/* Only GP detector can update. */
+	rcu_mb_needed		/* Flip just completed, need an mb(). */
+				/* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
+								= rcu_mb_done;
+
+/*
+ * RCU_DATA_ME: find the current CPU's rcu_data structure.
+ * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
+ */
+#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
+#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable, but where the CPU number is so cached.
+ */
+#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable.
+ */
+#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is pointed
+ * to by a local variable.
+ */
+#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+void __rcu_read_lock(void)
+{
+	int idx;
+	struct task_struct *t = current;
+	int nesting;
+
+	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+	if (nesting != 0) {
+
+		/* An earlier rcu_read_lock() covers us, just count it. */
+
+		t->rcu_read_lock_nesting = nesting + 1;
+
+	} else {
+		unsigned long flags;
+
+		/*
+		 * We disable interrupts for the following reasons:
+		 * - If we get scheduling clock interrupt here, and we
+		 *   end up acking the counter flip, it's like a promise
+		 *   that we will never increment the old counter again.
+		 *   Thus we will break that promise if that
+		 *   scheduling clock interrupt happens between the time
+		 *   we pick the .completed field and the time that we
+		 *   increment our counter.
+		 *
+		 * - We don't want to be preempted out here.
+		 *
+		 * NMIs can still occur, of course, and might themselves
+		 * contain rcu_read_lock().
+		 */
+
+		local_irq_save(flags);
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so increment
+		 * the current counter for the current CPU.  Use volatile
+		 * casts to prevent the compiler from reordering.
+		 */
+
+		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
+		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
+
+		/*
+		 * Now that the per-CPU counter has been incremented, we
+		 * are protected from races with rcu_read_lock() invoked
+		 * from NMI handlers on this CPU.  We can therefore safely
+		 * increment the nesting counter, relieving further NMIs
+		 * of the need to increment the per-CPU counter.
+		 */
+
+		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
+
+		/*
+		 * Now that we have preventing any NMIs from storing
+		 * to the ->rcu_flipctr_idx, we can safely use it to
+		 * remember which counter to decrement in the matching
+		 * rcu_read_unlock().
+		 */
+
+		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
+		local_irq_restore(flags);
+	}
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+void __rcu_read_unlock(void)
+{
+	int idx;
+	struct task_struct *t = current;
+	int nesting;
+
+	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+	if (nesting > 1) {
+
+		/*
+		 * We are still protected by the enclosing rcu_read_lock(),
+		 * so simply decrement the counter.
+		 */
+
+		t->rcu_read_lock_nesting = nesting - 1;
+
+	} else {
+		unsigned long flags;
+
+		/*
+		 * Disable local interrupts to prevent the grace-period
+		 * detection state machine from seeing us half-done.
+		 * NMIs can still occur, of course, and might themselves
+		 * contain rcu_read_lock() and rcu_read_unlock().
+		 */
+
+		local_irq_save(flags);
+
+		/*
+		 * Outermost nesting of rcu_read_unlock(), so we must
+		 * decrement the current counter for the current CPU.
+		 * This must be done carefully, because NMIs can
+		 * occur at any point in this code, and any rcu_read_lock()
+		 * and rcu_read_unlock() pairs in the NMI handlers
+		 * must interact non-destructively with this code.
+		 * Lots of volatile casts, and -very- careful ordering.
+		 *
+		 * Changes to this code, including this one, must be
+		 * inspected, validated, and tested extremely carefully!!!
+		 */
+
+		/*
+		 * First, pick up the index.
+		 */
+
+		idx = ACCESS_ONCE(t->rcu_flipctr_idx);
+
+		/*
+		 * Now that we have fetched the counter index, it is
+		 * safe to decrement the per-task RCU nesting counter.
+		 * After this, any interrupts or NMIs will increment and
+		 * decrement the per-CPU counters.
+		 */
+		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
+
+		/*
+		 * It is now safe to decrement this task's nesting count.
+		 * NMIs that occur after this statement will route their
+		 * rcu_read_lock() calls through this "else" clause, and
+		 * will thus start incrementing the per-CPU counter on
+		 * their own.  They will also clobber ->rcu_flipctr_idx,
+		 * but that is OK, since we have already fetched it.
+		 */
+
+		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
+		local_irq_restore(flags);
+	}
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * If a global counter flip has occurred since the last time that we
+ * advanced callbacks, advance them.  Hardware interrupts must be
+ * disabled when calling this function.
+ */
+static void __rcu_advance_callbacks(struct rcu_data *rdp)
+{
+	int cpu;
+	int i;
+	int wlc = 0;
+
+	if (rdp->completed != rcu_ctrlblk.completed) {
+		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
+			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
+			rdp->donetail = rdp->waittail[GP_STAGES - 1];
+			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
+		}
+		for (i = GP_STAGES - 2; i >= 0; i--) {
+			if (rdp->waitlist[i] != NULL) {
+				rdp->waitlist[i + 1] = rdp->waitlist[i];
+				rdp->waittail[i + 1] = rdp->waittail[i];
+				wlc++;
+			} else {
+				rdp->waitlist[i + 1] = NULL;
+				rdp->waittail[i + 1] =
+					&rdp->waitlist[i + 1];
+			}
+		}
+		if (rdp->nextlist != NULL) {
+			rdp->waitlist[0] = rdp->nextlist;
+			rdp->waittail[0] = rdp->nexttail;
+			wlc++;
+			rdp->nextlist = NULL;
+			rdp->nexttail = &rdp->nextlist;
+			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
+		} else {
+			rdp->waitlist[0] = NULL;
+			rdp->waittail[0] = &rdp->waitlist[0];
+		}
+		rdp->waitlistcount = wlc;
+		rdp->completed = rcu_ctrlblk.completed;
+	}
+
+	/*
+	 * Check to see if this CPU needs to report that it has seen
+	 * the most recent counter flip, thereby declaring that all
+	 * subsequent rcu_read_lock() invocations will respect this flip.
+	 */
+
+	cpu = raw_smp_processor_id();
+	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+		smp_mb();  /* Subsequent counter accesses must see new value */
+		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+		smp_mb();  /* Subsequent RCU read-side critical sections */
+			   /*  seen -after- acknowledgement. */
+	}
+}
+
+/*
+ * Get here when RCU is idle.  Decide whether we need to
+ * move out of idle state, and return non-zero if so.
+ * "Straightforward" approach for the moment, might later
+ * use callback-list lengths, grace-period duration, or
+ * some such to determine when to exit idle state.
+ * Might also need a pre-idle test that does not acquire
+ * the lock, but let's get the simple case working first...
+ */
+
+static int
+rcu_try_flip_idle(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
+	if (!rcu_pending(smp_processor_id())) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
+		return 0;
+	}
+
+	/*
+	 * Do the flip.
+	 */
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
+	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
+
+	/*
+	 * Need a memory barrier so that other CPUs see the new
+	 * counter value before they see the subsequent change of all
+	 * the rcu_flip_flag instances to rcu_flipped.
+	 */
+
+	smp_mb();	/* see above block comment. */
+
+	/* Now ask each CPU for acknowledgement of the flip. */
+
+	for_each_possible_cpu(cpu)
+		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+
+	return 1;
+}
+
+/*
+ * Wait for CPUs to acknowledge the flip.
+ */
+
+static int
+rcu_try_flip_waitack(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
+	for_each_possible_cpu(cpu)
+		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
+			return 0;
+		}
+
+	/*
+	 * Make sure our checks above don't bleed into subsequent
+	 * waiting for the sum of the counters to reach zero.
+	 */
+
+	smp_mb();	/* see above block comment. */
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
+	return 1;
+}
+
+/*
+ * Wait for collective ``last'' counter to reach zero,
+ * then tell all CPUs to do an end-of-grace-period memory barrier.
+ */
+
+static int
+rcu_try_flip_waitzero(void)
+{
+	int cpu;
+	int lastidx = !(rcu_ctrlblk.completed & 0x1);
+	int sum = 0;
+
+	/* Check to see if the sum of the "last" counters is zero. */
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
+	for_each_possible_cpu(cpu)
+		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
+	if (sum != 0) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
+		return 0;
+	}
+
+	/*
+	 * This ensures that the other CPUs see the call for
+	 * memory barriers -after- the sum to zero has been
+	 * detected here
+	 */
+	smp_mb();  /*  ^^^^^^^^^^^^ */
+
+	/* Call for a memory barrier from each CPU. */
+	for_each_possible_cpu(cpu)
+		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
+	return 1;
+}
+
+/*
+ * Wait for all CPUs to do their end-of-grace-period memory barrier.
+ * Return 0 once all CPUs have done so.
+ */
+
+static int
+rcu_try_flip_waitmb(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
+	for_each_possible_cpu(cpu)
+		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
+			return 0;
+		}
+
+	smp_mb(); /* Ensure that the above checks precede any following flip. */
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
+	return 1;
+}
+
+/*
+ * Attempt a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * at least GP_STAGES consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void rcu_try_flip(void)
+{
+	unsigned long flags;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
+	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
+		return;
+	}
+
+	/*
+	 * Take the next transition(s) through the RCU grace-period
+	 * flip-counter state machine.
+	 */
+
+	switch (rcu_ctrlblk.rcu_try_flip_state) {
+	case rcu_try_flip_idle_state:
+		if (rcu_try_flip_idle())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitack_state;
+		break;
+	case rcu_try_flip_waitack_state:
+		if (rcu_try_flip_waitack())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitzero_state;
+		break;
+	case rcu_try_flip_waitzero_state:
+		if (rcu_try_flip_waitzero())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitmb_state;
+		break;
+	case rcu_try_flip_waitmb_state:
+		if (rcu_try_flip_waitmb())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_idle_state;
+	}
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
+
+/*
+ * Check to see if this CPU needs to do a memory barrier in order to
+ * ensure that any prior RCU read-side critical sections have committed
+ * their counter manipulations and critical-section memory references
+ * before declaring the grace period to be completed.
+ */
+static void rcu_check_mb(int cpu)
+{
+	if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
+		smp_mb();  /* Ensure RCU read-side accesses are visible. */
+		per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
+	}
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+	unsigned long flags;
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	rcu_check_mb(cpu);
+	if (rcu_ctrlblk.completed == rdp->completed)
+		rcu_try_flip();
+	spin_lock_irqsave(&rdp->lock, flags);
+	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+	__rcu_advance_callbacks(rdp);
+	if (rdp->donelist == NULL) {
+		spin_unlock_irqrestore(&rdp->lock, flags);
+	} else {
+		spin_unlock_irqrestore(&rdp->lock, flags);
+		raise_softirq(RCU_SOFTIRQ);
+	}
+}
+
+/*
+ * Needed by dynticks, to make sure all RCU processing has finished
+ * when we go idle:
+ */
+void rcu_advance_callbacks(int cpu, int user)
+{
+	unsigned long flags;
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	if (rcu_ctrlblk.completed == rdp->completed) {
+		rcu_try_flip();
+		if (rcu_ctrlblk.completed == rdp->completed)
+			return;
+	}
+	spin_lock_irqsave(&rdp->lock, flags);
+	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+	__rcu_advance_callbacks(rdp);
+	spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list;
+	struct rcu_data *rdp = RCU_DATA_ME();
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	list = rdp->donelist;
+	if (list == NULL) {
+		spin_unlock_irqrestore(&rdp->lock, flags);
+		return;
+	}
+	rdp->donelist = NULL;
+	rdp->donetail = &rdp->donelist;
+	RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
+	spin_unlock_irqrestore(&rdp->lock, flags);
+	while (list) {
+		next = list->next;
+		list->func(list);
+		list = next;
+		RCU_TRACE_ME(rcupreempt_trace_invoke);
+	}
+}
+
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
+	__rcu_advance_callbacks(rdp);
+	*rdp->nexttail = head;
+	rdp->nexttail = &head->next;
+	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
+	spin_unlock(&rdp->lock);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Wait until all currently running preempt_disable() code segments
+ * (including hardware-irq-disable segments) complete.  Note that
+ * in -rt this does -not- necessarily result in all currently executing
+ * interrupt -handlers- having completed.
+ */
+void __synchronize_sched(void)
+{
+	cpumask_t oldmask;
+	int cpu;
+
+	if (sched_getaffinity(0, &oldmask) < 0)
+		oldmask = cpu_possible_map;
+	for_each_online_cpu(cpu) {
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  Assumes that notifiers would take care of handling any
+ * outstanding requests from the RCU core.
+ *
+ * This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	return (rdp->donelist != NULL ||
+		!!rdp->waitlistcount ||
+		rdp->nextlist != NULL);
+}
+
+int rcu_pending(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	/* The CPU has at least one callback queued somewhere. */
+
+	if (rdp->donelist != NULL ||
+	    !!rdp->waitlistcount ||
+	    rdp->nextlist != NULL)
+		return 1;
+
+	/* The RCU core needs an acknowledgement from this CPU. */
+
+	if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
+	    (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
+		return 1;
+
+	/* This CPU has fallen behind the global grace-period number. */
+
+	if (rdp->completed != rcu_ctrlblk.completed)
+		return 1;
+
+	/* Nothing needed from this CPU. */
+
+	return 0;
+}
+
+void __init __rcu_init(void)
+{
+	int cpu;
+	int i;
+	struct rcu_data *rdp;
+
+	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
+	for_each_possible_cpu(cpu) {
+		rdp = RCU_DATA_CPU(cpu);
+		spin_lock_init(&rdp->lock);
+		rdp->completed = 0;
+		rdp->waitlistcount = 0;
+		rdp->nextlist = NULL;
+		rdp->nexttail = &rdp->nextlist;
+		for (i = 0; i < GP_STAGES; i++) {
+			rdp->waitlist[i] = NULL;
+			rdp->waittail[i] = &rdp->waitlist[i];
+		}
+		rdp->donelist = NULL;
+		rdp->donetail = &rdp->donelist;
+		rdp->rcu_flipctr[0] = 0;
+		rdp->rcu_flipctr[1] = 0;
+	}
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+#ifdef CONFIG_RCU_TRACE
+long *rcupreempt_flipctr(int cpu)
+{
+	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
+
+int rcupreempt_flip_flag(int cpu)
+{
+	return per_cpu(rcu_flip_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
+
+int rcupreempt_mb_flag(int cpu)
+{
+	return per_cpu(rcu_mb_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
+
+char *rcupreempt_try_flip_state_name(void)
+{
+	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
+
+struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	return &rdp->trace;
+}
+EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
+
+#endif /* #ifdef RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 0000000..49ac4947
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,330 @@
+/*
+ * Read-Copy Update tracing for realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/rcupreempt_trace.h>
+#include <linux/debugfs.h>
+
+static struct mutex rcupreempt_trace_mutex;
+static char *rcupreempt_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE 4096
+
+void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
+{
+	trace->done_length += trace->wait_length;
+	trace->done_add += trace->wait_length;
+	trace->wait_length = 0;
+}
+void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
+{
+	trace->wait_length += trace->next_length;
+	trace->wait_add += trace->next_length;
+	trace->next_length = 0;
+}
+void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->rcu_try_flip_1);
+}
+void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->rcu_try_flip_e1);
+}
+void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_i1++;
+}
+void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ie1++;
+}
+void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_g1++;
+}
+void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_a1++;
+}
+void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ae1++;
+}
+void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_a2++;
+}
+void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_z1++;
+}
+void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ze1++;
+}
+void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_z2++;
+}
+void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_m1++;
+}
+void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_me1++;
+}
+void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_m2++;
+}
+void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
+{
+	trace->rcu_check_callbacks++;
+}
+void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
+{
+	trace->done_remove += trace->done_length;
+	trace->done_length = 0;
+}
+void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->done_invoked);
+}
+void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
+{
+	trace->next_add++;
+	trace->next_length++;
+}
+
+static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
+{
+	struct rcupreempt_trace *cp;
+	int cpu;
+
+	memset(sp, 0, sizeof(*sp));
+	for_each_possible_cpu(cpu) {
+		cp = rcupreempt_trace_cpu(cpu);
+		sp->next_length += cp->next_length;
+		sp->next_add += cp->next_add;
+		sp->wait_length += cp->wait_length;
+		sp->wait_add += cp->wait_add;
+		sp->done_length += cp->done_length;
+		sp->done_add += cp->done_add;
+		sp->done_remove += cp->done_remove;
+		atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
+		atomic_set(&sp->rcu_try_flip_1,
+			   atomic_read(&cp->rcu_try_flip_1));
+		atomic_set(&sp->rcu_try_flip_e1,
+			   atomic_read(&cp->rcu_try_flip_e1));
+		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
+		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
+		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
+		sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
+		sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
+		sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
+		sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
+		sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
+		sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
+		sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
+		sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
+		sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
+	}
+}
+
+static ssize_t rcustats_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct rcupreempt_trace trace;
+	ssize_t bcount;
+	int cnt = 0;
+
+	rcupreempt_trace_sum(&trace);
+	mutex_lock(&rcupreempt_trace_mutex);
+	snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+		 "ggp=%ld rcc=%ld\n",
+		 rcu_batches_completed(),
+		 trace.rcu_check_callbacks);
+	snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+		 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
+		 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
+		 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
+
+		 trace.next_add, trace.next_length,
+		 trace.wait_add, trace.wait_length,
+		 trace.done_add, trace.done_length,
+		 trace.done_remove, atomic_read(&trace.done_invoked),
+		 atomic_read(&trace.rcu_try_flip_1),
+		 atomic_read(&trace.rcu_try_flip_e1),
+		 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
+		 trace.rcu_try_flip_g1,
+		 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
+			 trace.rcu_try_flip_a2,
+		 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
+			 trace.rcu_try_flip_z2,
+		 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
+			trace.rcu_try_flip_m2);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+	mutex_unlock(&rcupreempt_trace_mutex);
+	return bcount;
+}
+
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	long oldgp = rcu_batches_completed();
+	ssize_t bcount;
+
+	mutex_lock(&rcupreempt_trace_mutex);
+	synchronize_rcu();
+	snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
+		"oldggp=%ld  newggp=%ld\n", oldgp, rcu_batches_completed());
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+	mutex_unlock(&rcupreempt_trace_mutex);
+	return bcount;
+}
+
+static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	int cnt = 0;
+	int cpu;
+	int f = rcu_batches_completed() & 0x1;
+	ssize_t bcount;
+
+	mutex_lock(&rcupreempt_trace_mutex);
+
+	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
+				"CPU last cur F M\n");
+	for_each_online_cpu(cpu) {
+		long *flipctr = rcupreempt_flipctr(cpu);
+		cnt += snprintf(&rcupreempt_trace_buf[cnt],
+				RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+					"%3d %4ld %3ld %d %d\n",
+			       cpu,
+			       flipctr[!f],
+			       flipctr[f],
+			       rcupreempt_flip_flag(cpu),
+			       rcupreempt_mb_flag(cpu));
+	}
+	cnt += snprintf(&rcupreempt_trace_buf[cnt],
+			RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+			"ggp = %ld, state = %s\n",
+			rcu_batches_completed(),
+			rcupreempt_try_flip_state_name());
+	cnt += snprintf(&rcupreempt_trace_buf[cnt],
+			RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+			"\n");
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+	mutex_unlock(&rcupreempt_trace_mutex);
+	return bcount;
+}
+
+static struct file_operations rcustats_fops = {
+	.owner = THIS_MODULE,
+	.read = rcustats_read,
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+static struct file_operations rcuctrs_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuctrs_read,
+};
+
+static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
+static int rcupreempt_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	statdir = debugfs_create_file("rcustats", 0444, rcudir,
+						NULL, &rcustats_fops);
+	if (!statdir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
+						NULL, &rcuctrs_fops);
+	if (!ctrsdir)
+		goto free_out;
+	return 0;
+free_out:
+	if (statdir)
+		debugfs_remove(statdir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+static int __init rcupreempt_trace_init(void)
+{
+	mutex_init(&rcupreempt_trace_mutex);
+	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcupreempt_trace_buf)
+		return 1;
+	return rcupreempt_debugfs_init();
+}
+
+static void __exit rcupreempt_trace_cleanup(void)
+{
+	debugfs_remove(statdir);
+	debugfs_remove(gpdir);
+	debugfs_remove(ctrsdir);
+	debugfs_remove(rcudir);
+	kfree(rcupreempt_trace_buf);
+}
+
+
+module_init(rcupreempt_trace_init);
+module_exit(rcupreempt_trace_cleanup);
-- 
cgit v1.1


From eaf649e9fe6685f4c5a392cd0e16df5fd6660b7c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 25 Jan 2008 21:08:25 +0100
Subject: Preempt-RCU: CPU Hotplug handling

This patch allows preemptible RCU to tolerate CPU-hotplug operations.
It accomplishes this by maintaining a local copy of a map of online
CPUs, which it accesses under its own lock.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 142 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index a5aabb1..987cfb7 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,6 +147,8 @@ static char *rcu_try_flip_state_names[] =
 	{ "idle", "waitack", "waitzero", "waitmb" };
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
+static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
+
 /*
  * Enum and per-CPU flag to determine when each CPU has seen
  * the most recent counter flip.
@@ -445,7 +447,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_possible_cpu(cpu)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 
 	return 1;
@@ -461,7 +463,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_possible_cpu(cpu)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
 		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 			return 0;
@@ -492,7 +494,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_possible_cpu(cpu)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -507,7 +509,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_possible_cpu(cpu)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
@@ -525,7 +527,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_possible_cpu(cpu)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
 		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 			return 0;
@@ -637,6 +639,98 @@ void rcu_advance_callbacks(int cpu, int user)
 	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
+		*dsttail = srclist; \
+		if (srclist != NULL) { \
+			dsttail = srctail; \
+			srclist = NULL; \
+			srctail = &srclist;\
+		} \
+	} while (0)
+
+void rcu_offline_cpu(int cpu)
+{
+	int i;
+	struct rcu_head *list = NULL;
+	unsigned long flags;
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	struct rcu_head **tail = &list;
+
+	/*
+	 * Remove all callbacks from the newly dead CPU, retaining order.
+	 * Otherwise rcu_barrier() will fail
+	 */
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
+	for (i = GP_STAGES - 1; i >= 0; i--)
+		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
+						list, tail);
+	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+	spin_unlock_irqrestore(&rdp->lock, flags);
+	rdp->waitlistcount = 0;
+
+	/* Disengage the newly dead CPU from the grace-period computation. */
+
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+	rcu_check_mb(cpu);
+	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+		smp_mb();  /* Subsequent counter accesses must see new value */
+		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+		smp_mb();  /* Subsequent RCU read-side critical sections */
+			   /*  seen -after- acknowledgement. */
+	}
+
+	RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+	RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
+
+	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
+	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
+
+	cpu_clear(cpu, rcu_cpu_online_map);
+
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+	/*
+	 * Place the removed callbacks on the current CPU's queue.
+	 * Make them all start a new grace period: simple approach,
+	 * in theory could starve a given set of callbacks, but
+	 * you would need to be doing some serious CPU hotplugging
+	 * to make this happen.  If this becomes a problem, adding
+	 * a synchronize_rcu() to the hotplug path would be a simple
+	 * fix.
+	 */
+
+	rdp = RCU_DATA_ME();
+	spin_lock_irqsave(&rdp->lock, flags);
+	*rdp->nexttail = list;
+	if (list)
+		rdp->nexttail = tail;
+	spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+void __devinit rcu_online_cpu(int cpu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+	cpu_set(cpu, rcu_cpu_online_map);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+void rcu_offline_cpu(int cpu)
+{
+}
+
+void __devinit rcu_online_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	unsigned long flags;
@@ -746,6 +840,32 @@ int rcu_pending(int cpu)
 	return 0;
 }
 
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call = rcu_cpu_notify,
+};
+
 void __init __rcu_init(void)
 {
 	int cpu;
@@ -769,6 +889,23 @@ void __init __rcu_init(void)
 		rdp->rcu_flipctr[0] = 0;
 		rdp->rcu_flipctr[1] = 0;
 	}
+	register_cpu_notifier(&rcu_nb);
+
+	/*
+	 * We don't need protection against CPU-Hotplug here
+	 * since
+	 * a) If a CPU comes online while we are iterating over the
+	 *    cpu_online_map below, we would only end up making a
+	 *    duplicate call to rcu_online_cpu() which sets the corresponding
+	 *    CPU's mask in the rcu_cpu_online_map.
+	 *
+	 * b) A CPU cannot go offline at this point in time since the user
+	 *    does not have access to the sysfs interface, nor do we
+	 *    suspend the system.
+	 */
+	for_each_online_cpu(cpu)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
+
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
 }
 
-- 
cgit v1.1


From dc938520d2bf343b239795cfa24e4f44649358dc Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 25 Jan 2008 21:08:26 +0100
Subject: sched: dynamically update the root-domain span/online maps

The baseline code statically builds the span maps when the domain is formed.
Previous attempts at dynamically updating the maps caused a suspend-to-ram
regression, which should now be fixed.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5834c7f..02d4688 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -359,8 +359,6 @@ struct rt_rq {
  * exclusive cpuset is created, we also create and attach a new root-domain
  * object.
  *
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
  */
 struct root_domain {
 	atomic_t refcount;
@@ -375,6 +373,10 @@ struct root_domain {
 	atomic_t rto_count;
 };
 
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
 static struct root_domain def_root_domain;
 
 #endif
@@ -5859,6 +5861,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 				class->leave_domain(rq);
 		}
 
+		cpu_clear(rq->cpu, old_rd->span);
+		cpu_clear(rq->cpu, old_rd->online);
+
 		if (atomic_dec_and_test(&old_rd->refcount))
 			kfree(old_rd);
 	}
@@ -5866,6 +5871,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	atomic_inc(&rd->refcount);
 	rq->rd = rd;
 
+	cpu_set(rq->cpu, rd->span);
+	if (cpu_isset(rq->cpu, cpu_online_map))
+		cpu_set(rq->cpu, rd->online);
+
 	for (class = sched_class_highest; class; class = class->next) {
 		if (class->join_domain)
 			class->join_domain(rq);
@@ -5874,23 +5883,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-static void init_rootdomain(struct root_domain *rd, const cpumask_t *map)
+static void init_rootdomain(struct root_domain *rd)
 {
 	memset(rd, 0, sizeof(*rd));
 
-	rd->span = *map;
-	cpus_and(rd->online, rd->span, cpu_online_map);
+	cpus_clear(rd->span);
+	cpus_clear(rd->online);
 }
 
 static void init_defrootdomain(void)
 {
-	cpumask_t cpus = CPU_MASK_ALL;
-
-	init_rootdomain(&def_root_domain, &cpus);
+	init_rootdomain(&def_root_domain);
 	atomic_set(&def_root_domain.refcount, 1);
 }
 
-static struct root_domain *alloc_rootdomain(const cpumask_t *map)
+static struct root_domain *alloc_rootdomain(void)
 {
 	struct root_domain *rd;
 
@@ -5898,7 +5905,7 @@ static struct root_domain *alloc_rootdomain(const cpumask_t *map)
 	if (!rd)
 		return NULL;
 
-	init_rootdomain(rd, map);
+	init_rootdomain(rd);
 
 	return rd;
 }
@@ -6319,7 +6326,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
 
-	rd = alloc_rootdomain(cpu_map);
+	rd = alloc_rootdomain();
 	if (!rd) {
 		printk(KERN_WARNING "Cannot alloc root domain\n");
 		return -ENOMEM;
@@ -6894,7 +6901,6 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
-		rq_attach_root(rq, &def_root_domain);
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
@@ -6903,6 +6909,7 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq->rt.highest_prio = MAX_RT_PRIO;
 		rq->rt.overloaded = 0;
+		rq_attach_root(rq, &def_root_domain);
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
-- 
cgit v1.1


From 8eb703e4f33488bf75829564d51d427e17f7cd4c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jan 2008 21:08:26 +0100
Subject: uids: merge multiple error paths in alloc_uid() into one

There are already 4 error paths in alloc_uid() that do incremental rollbacks.
I think it's time to merge them.  This costs us 8 lines of code :)

Maybe it would be better to merge this patch with the previous one, but I
remember that some time ago I sent a similar patch (fixing the error path and
cleaning it), but I was told to make two patches in such cases.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user.c | 47 ++++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index ab4fd70..bc1c48d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -319,7 +319,7 @@ void free_uid(struct user_struct *up)
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 {
 	struct hlist_head *hashent = uidhashentry(ns, uid);
-	struct user_struct *up;
+	struct user_struct *up, *new;
 
 	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	 * atomic.
@@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	spin_unlock_irq(&uidhash_lock);
 
 	if (!up) {
-		struct user_struct *new;
-
 		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
-		if (!new) {
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (!new)
+			goto out_unlock;
 
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
@@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 #endif
 		new->locked_shm = 0;
 
-		if (alloc_uid_keyring(new, current) < 0) {
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (alloc_uid_keyring(new, current) < 0)
+			goto out_free_user;
 
-		if (sched_create_user(new) < 0) {
-			key_put(new->uid_keyring);
-			key_put(new->session_keyring);
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (sched_create_user(new) < 0)
+			goto out_put_keys;
 
-		if (uids_user_create(new)) {
-			sched_destroy_user(new);
-			key_put(new->uid_keyring);
-			key_put(new->session_keyring);
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (uids_user_create(new))
+			goto out_destoy_sched;
 
 		/*
 		 * Before adding this, check whether we raced
@@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	uids_mutex_unlock();
 
 	return up;
+
+out_destoy_sched:
+	sched_destroy_user(new);
+out_put_keys:
+	key_put(new->uid_keyring);
+	key_put(new->session_keyring);
+out_free_user:
+	kmem_cache_free(uid_cachep, new);
+out_unlock:
+	uids_mutex_unlock();
+	return NULL;
 }
 
 void switch_uid(struct user_struct *new_user)
-- 
cgit v1.1


From fa717060f1ab7eb6570f2fb49136f838fc9195a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:27 +0100
Subject: sched: sched_rt_entity

Move the task_struct members specific to rt scheduling together.
A future optimization could be to put sched_entity and sched_rt_entity
into a union.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  2 +-
 kernel/sched_rt.c | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 02d4688..c2cedd0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1685,7 +1685,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.wait_max			= 0;
 #endif
 
-	INIT_LIST_HEAD(&p->run_list);
+	INIT_LIST_HEAD(&p->rt.run_list);
 	p->se.on_rq = 0;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9affb3c..29963af 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -111,7 +111,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 
-	list_add_tail(&p->run_list, array->queue + p->prio);
+	list_add_tail(&p->rt.run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	inc_cpu_load(rq, p->se.load.weight);
 
@@ -127,7 +127,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 
-	list_del(&p->run_list);
+	list_del(&p->rt.run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
 	dec_cpu_load(rq, p->se.load.weight);
@@ -143,7 +143,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 
-	list_move_tail(&p->run_list, array->queue + p->prio);
+	list_move_tail(&p->rt.run_list, array->queue + p->prio);
 }
 
 static void
@@ -212,7 +212,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 		return NULL;
 
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
+	next = list_entry(queue->next, struct task_struct, rt.run_list);
 
 	next->se.exec_start = rq->clock;
 
@@ -261,14 +261,14 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 	queue = array->queue + idx;
 	BUG_ON(list_empty(queue));
 
-	next = list_entry(queue->next, struct task_struct, run_list);
+	next = list_entry(queue->next, struct task_struct, rt.run_list);
 	if (unlikely(pick_rt_task(rq, next, cpu)))
 		goto out;
 
 	if (queue->next->next != queue) {
 		/* same prio task */
 		next = list_entry(queue->next->next, struct task_struct,
-				  run_list);
+				  rt.run_list);
 		if (pick_rt_task(rq, next, cpu))
 			goto out;
 	}
@@ -282,7 +282,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 	queue = array->queue + idx;
 	BUG_ON(list_empty(queue));
 
-	list_for_each_entry(next, queue, run_list) {
+	list_for_each_entry(next, queue, rt.run_list) {
 		if (pick_rt_task(rq, next, cpu))
 			goto out;
 	}
@@ -846,16 +846,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 	if (p->policy != SCHED_RR)
 		return;
 
-	if (--p->time_slice)
+	if (--p->rt.time_slice)
 		return;
 
-	p->time_slice = DEF_TIMESLICE;
+	p->rt.time_slice = DEF_TIMESLICE;
 
 	/*
 	 * Requeue to the end of queue if we are not the only element
 	 * on the queue:
 	 */
-	if (p->run_list.prev != p->run_list.next) {
+	if (p->rt.run_list.prev != p->rt.run_list.next) {
 		requeue_task_rt(rq, p);
 		set_tsk_need_resched(p);
 	}
-- 
cgit v1.1


From 78f2c7db6068fd6ef75b8c120f04a388848eacb5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:27 +0100
Subject: sched: SCHED_FIFO/SCHED_RR watchdog timer

Introduce a new rlimit that allows the user to set a runtime timeout on
real-time tasks their slice. Once this limit is exceeded the task will receive
SIGXCPU.

So it measures runtime since the last sleep.

Input and ideas by Thomas Gleixner and Lennart Poettering.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Lennart Poettering <mzxreary@0pointer.de>
CC: Michael Kerrisk <mtk.manpages@googlemail.com>
CC: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 29 +++++++++++++++++++++++++++++
 kernel/sched_rt.c         | 30 ++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 68c9637..2c076b3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
 {
 	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
+	struct signal_struct *const sig = tsk->signal;
 
 	maxfire = 20;
 	tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,34 @@ static void check_thread_timers(struct task_struct *tsk,
 		t->firing = 1;
 		list_move_tail(&t->entry, firing);
 	}
+
+	/*
+	 * Check for the special case thread timers.
+	 */
+	if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
+		unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
+		unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
+
+		if (tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+			/*
+			 * At the hard limit, we just die.
+			 * No need to calculate anything else now.
+			 */
+			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+			return;
+		}
+		if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
+			/*
+			 * At the soft limit, send a SIGXCPU every second.
+			 */
+			if (sig->rlim[RLIMIT_RTTIME].rlim_cur
+			    < sig->rlim[RLIMIT_RTTIME].rlim_max) {
+				sig->rlim[RLIMIT_RTTIME].rlim_cur +=
+								USEC_PER_SEC;
+			}
+			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+		}
+	}
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 29963af..f350f7b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -116,6 +116,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	inc_cpu_load(rq, p->se.load.weight);
 
 	inc_rt_tasks(p, rq);
+
+	if (wakeup)
+		p->rt.timeout = 0;
 }
 
 /*
@@ -834,11 +837,38 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 	}
 }
 
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+	unsigned long soft, hard;
+
+	if (!p->signal)
+		return;
+
+	soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
+	hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+
+	if (soft != RLIM_INFINITY) {
+		unsigned long next;
+
+		p->rt.timeout++;
+		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+		if (next > p->rt.timeout) {
+			u64 next_time = p->se.sum_exec_runtime;
+
+			next_time += next * (NSEC_PER_SEC/HZ);
+			if (p->it_sched_expires > next_time)
+				p->it_sched_expires = next_time;
+		} else
+			p->it_sched_expires = p->se.sum_exec_runtime;
+	}
+}
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 
+	watchdog(rq, p);
+
 	/*
 	 * RR tasks need a special form of timeslice management.
 	 * FIFO tasks have no timeslices.
-- 
cgit v1.1


From 03319ec8b06849051747a17aa2a0f9aba9277980 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:28 +0100
Subject: sched: documentation, whitespace fixes

whitespace fixes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c2cedd0..b9ee0f4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -182,7 +182,7 @@ struct task_group {
 	 *
 	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
 	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-	 * 	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
 	 *
 	 * The weight assigned to a task group's schedulable entities on every
 	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
@@ -192,9 +192,9 @@ struct task_group {
 	 *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
 	 *
 	 * Note: It's not necessary that each of a task's group schedulable
-	 * 	 entity have the same weight on all CPUs. If the group
-	 * 	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-	 * 	 better distribution of weight could be:
+	 *	 entity have the same weight on all CPUs. If the group
+	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+	 *	 better distribution of weight could be:
 	 *
 	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
 	 *	tg_A->se[1]->load.weight = 1/2 * 2000 =  667
-- 
cgit v1.1


From 02b67cc3ba36bdba351d6c3a00593f4ec550d9d3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 25 Jan 2008 21:08:28 +0100
Subject: sched: do not do cond_resched() when CONFIG_PREEMPT

Why do we even have cond_resched when real preemption
is on? It seems to be a waste of space and time.

remove cond_resched with CONFIG_PREEMPT on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b9ee0f4..6ee3760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4678,7 +4678,8 @@ static void __cond_resched(void)
 	} while (need_resched());
 }
 
-int __sched cond_resched(void)
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+int __sched _cond_resched(void)
 {
 	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
 					system_state == SYSTEM_RUNNING) {
@@ -4687,7 +4688,8 @@ int __sched cond_resched(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(cond_resched);
+EXPORT_SYMBOL(_cond_resched);
+#endif
 
 /*
  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
-- 
cgit v1.1


From 8f4d37ec073c17e2d4aa8851df5837d798606d6f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:29 +0100
Subject: sched: high-res preemption tick

Use HR-timers (when available) to deliver an accurate preemption tick.

The regular scheduler tick that runs at 1/HZ can be too coarse when nice
level are used. The fairness system will still keep the cpu utilisation 'fair'
by then delaying the task that got an excessive amount of CPU time but try to
minimize this by delivering preemption points spot-on.

The average frequency of this extra interrupt is sched_latency / nr_latency.
Which need not be higher than 1/HZ, its just that the distribution within the
sched_latency period is important.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.hz       |   2 +
 kernel/sched.c          | 210 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_fair.c     |  69 +++++++++++++++-
 kernel/sched_idletask.c |   2 +-
 kernel/sched_rt.c       |   2 +-
 5 files changed, 268 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802..526128a 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
 	default 300 if HZ_300
 	default 1000 if HZ_1000
 
+config SCHED_HRTICK
+	def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/sched.c b/kernel/sched.c
index 6ee3760..17f93d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -65,6 +65,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
+#include <linux/hrtimer.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -451,6 +452,12 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+#ifdef CONFIG_SCHED_HRTICK
+	unsigned long hrtick_flags;
+	ktime_t hrtick_expire;
+	struct hrtimer hrtick_timer;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
@@ -572,6 +579,8 @@ enum {
 	SCHED_FEAT_START_DEBIT		= 4,
 	SCHED_FEAT_TREE_AVG		= 8,
 	SCHED_FEAT_APPROX_AVG		= 16,
+	SCHED_FEAT_HRTICK		= 32,
+	SCHED_FEAT_DOUBLE_TICK		= 64,
 };
 
 const_debug unsigned int sysctl_sched_features =
@@ -579,7 +588,9 @@ const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0;
+		SCHED_FEAT_APPROX_AVG		* 0 |
+		SCHED_FEAT_HRTICK		* 1 |
+		SCHED_FEAT_DOUBLE_TICK		* 0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
@@ -796,6 +807,173 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+static void __resched_task(struct task_struct *p, int tif_bit);
+
+static inline void resched_task(struct task_struct *p)
+{
+	__resched_task(p, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * Its all a bit involved since we cannot program an hrt while holding the
+ * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+static inline void resched_hrt(struct task_struct *p)
+{
+	__resched_task(p, TIF_HRTICK_RESCHED);
+}
+
+static inline void resched_rq(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	resched_task(rq->curr);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+enum {
+	HRTICK_SET,		/* re-programm hrtick_timer */
+	HRTICK_RESET,		/* not a new slice */
+};
+
+/*
+ * Use hrtick when:
+ *  - enabled by features
+ *  - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+	if (!sched_feat(HRTICK))
+		return 0;
+	return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay, int reset)
+{
+	assert_spin_locked(&rq->lock);
+
+	/*
+	 * preempt at: now + delay
+	 */
+	rq->hrtick_expire =
+		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
+	/*
+	 * indicate we need to program the timer
+	 */
+	__set_bit(HRTICK_SET, &rq->hrtick_flags);
+	if (reset)
+		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
+
+	/*
+	 * New slices are called from the schedule path and don't need a
+	 * forced reschedule.
+	 */
+	if (reset)
+		resched_hrt(rq->curr);
+}
+
+static void hrtick_clear(struct rq *rq)
+{
+	if (hrtimer_active(&rq->hrtick_timer))
+		hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * Update the timer from the possible pending state.
+ */
+static void hrtick_set(struct rq *rq)
+{
+	ktime_t time;
+	int set, reset;
+	unsigned long flags;
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock_irqsave(&rq->lock, flags);
+	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
+	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
+	time = rq->hrtick_expire;
+	clear_thread_flag(TIF_HRTICK_RESCHED);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (set) {
+		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
+		if (reset && !hrtimer_active(&rq->hrtick_timer))
+			resched_rq(rq);
+	} else
+		hrtick_clear(rq);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	spin_unlock(&rq->lock);
+
+	return HRTIMER_NORESTART;
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+	rq->hrtick_flags = 0;
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+}
+
+void hrtick_resched(void)
+{
+	struct rq *rq;
+	unsigned long flags;
+
+	if (!test_thread_flag(TIF_HRTICK_RESCHED))
+		return;
+
+	local_irq_save(flags);
+	rq = cpu_rq(smp_processor_id());
+	hrtick_set(rq);
+	local_irq_restore(flags);
+}
+#else
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_set(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+void hrtick_resched(void)
+{
+}
+#endif
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -809,16 +987,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_thread_flag(p, tif_bit);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -841,10 +1019,10 @@ static void resched_cpu(int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else
-static inline void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
+	set_tsk_thread_flag(p, tif_bit);
 }
 #endif
 
@@ -3497,7 +3675,7 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
-		curr->sched_class->task_tick(rq, curr);
+		curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -3643,6 +3821,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
+	hrtick_clear(rq);
+
 	/*
 	 * Do the rq-clock update outside the rq lock:
 	 */
@@ -3680,14 +3860,20 @@ need_resched_nonpreemptible:
 		++*switch_count;
 
 		context_switch(rq, prev, next); /* unlocks the rq */
+		/*
+		 * the context switch might have flipped the stack from under
+		 * us, hence refresh the local variables.
+		 */
+		cpu = smp_processor_id();
+		rq = cpu_rq(cpu);
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+	hrtick_set(rq);
+
+	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-	}
+
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
@@ -6913,6 +7099,8 @@ void __init sched_init(void)
 		rq->rt.overloaded = 0;
 		rq_attach_root(rq, &def_root_domain);
 #endif
+		init_rq_hrtick(rq);
+
 		atomic_set(&rq->nr_iowait, 0);
 
 		array = &rq->rt.active;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index dfa18d5..3dab1ff 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -642,13 +642,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	cfs_rq->curr = NULL;
 }
 
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 
+#ifdef CONFIG_SCHED_HRTICK
+	/*
+	 * queued ticks are scheduled to match the slice, so don't bother
+	 * validating it and just reschedule.
+	 */
+	if (queued)
+		return resched_task(rq_of(cfs_rq)->curr);
+	/*
+	 * don't let the period tick interfere with the hrtick preemption
+	 */
+	if (!sched_feat(DOUBLE_TICK) &&
+			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+		return;
+#endif
+
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
@@ -754,6 +770,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+	int requeue = rq->curr == p;
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	WARN_ON(task_rq(p) != rq);
+
+	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+		u64 slice = sched_slice(cfs_rq, se);
+		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+		s64 delta = slice - ran;
+
+		if (delta < 0) {
+			if (rq->curr == p)
+				resched_task(p);
+			return;
+		}
+
+		/*
+		 * Don't schedule slices shorter than 10000ns, that just
+		 * doesn't make sense. Rely on vruntime for fairness.
+		 */
+		if (!requeue)
+			delta = max(10000LL, delta);
+
+		hrtick_start(rq, delta, requeue);
+	}
+}
+#else
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -782,6 +835,8 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	if (incload)
 		inc_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -814,6 +869,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 	 */
 	if (decload)
 		dec_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 
 /*
@@ -1049,6 +1106,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
+	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
@@ -1060,7 +1118,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
-	return task_of(se);
+	p = task_of(se);
+	hrtick_start_fair(rq, p);
+
+	return p;
 }
 
 /*
@@ -1235,14 +1296,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se);
+		entity_tick(cfs_rq, se, queued);
 	}
 }
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index ef7a266..2bcafa3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -61,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #endif
 
-static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f350f7b..83fbbcb 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -863,7 +863,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	}
 }
 
-static void task_tick_rt(struct rq *rq, struct task_struct *p)
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
 	update_curr_rt(rq);
 
-- 
cgit v1.1


From fa85ae2418e6843953107cd6a06f645752829bc0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:29 +0100
Subject: sched: rt time limit

Very simple time limit on the realtime scheduling classes.
Allow the rq's realtime class to consume sched_rt_ratio of every
sched_rt_period slice. If the class exceeds this quota the fair class
will preempt the realtime class.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 70 +++++++++++++++++++++++++++++++++++++++----------------
 kernel/sched_rt.c | 53 +++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c   | 18 +++++++++++++-
 3 files changed, 120 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 17f93d3..e9a7bee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -342,13 +342,14 @@ struct cfs_rq {
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
-	int rt_load_balance_idx;
-	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 	unsigned long rt_nr_running;
+#ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
-	/* highest queued rt task prio */
-	int highest_prio;
+	int highest_prio; /* highest queued rt task prio */
 	int overloaded;
+#endif
+	u64 rt_time;
+	u64 rt_throttled;
 };
 
 #ifdef CONFIG_SMP
@@ -415,6 +416,7 @@ struct rq {
 	struct list_head leaf_cfs_rq_list;
 #endif
 	struct rt_rq rt;
+	u64 rt_period_expire;
 
 	/*
 	 * This is part of a global counter where only the total sum
@@ -601,6 +603,21 @@ const_debug unsigned int sysctl_sched_features =
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
+ * period over which we measure -rt task cpu usage in ms.
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_rt_period = 1000;
+
+#define SCHED_RT_FRAC_SHIFT	16
+#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
+
+/*
+ * ratio of time -rt tasks may consume.
+ * default: 100%
+ */
+const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC;
+
+/*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
@@ -3674,8 +3691,8 @@ void scheduler_tick(void)
 		rq->clock = next_tick;
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
-	if (curr != rq->idle) /* FIXME: needed? */
-		curr->sched_class->task_tick(rq, curr, 0);
+	curr->sched_class->task_tick(rq, curr, 0);
+	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -7041,6 +7058,29 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
 
+static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+	struct rt_prio_array *array;
+	int i;
+
+	array = &rt_rq->active;
+	for (i = 0; i < MAX_RT_PRIO; i++) {
+		INIT_LIST_HEAD(array->queue + i);
+		__clear_bit(i, array->bitmap);
+	}
+	/* delimiter for bitsearch: */
+	__set_bit(MAX_RT_PRIO, array->bitmap);
+
+#ifdef CONFIG_SMP
+	rt_rq->rt_nr_migratory = 0;
+	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->overloaded = 0;
+#endif
+
+	rt_rq->rt_time = 0;
+	rt_rq->rt_throttled = 0;
+}
+
 void __init sched_init(void)
 {
 	int highest_cpu = 0;
@@ -7051,7 +7091,6 @@ void __init sched_init(void)
 #endif
 
 	for_each_possible_cpu(i) {
-		struct rt_prio_array *array;
 		struct rq *rq;
 
 		rq = cpu_rq(i);
@@ -7083,6 +7122,8 @@ void __init sched_init(void)
 		}
 		init_task_group.shares = init_task_group_load;
 #endif
+		init_rt_rq(&rq->rt, rq);
+		rq->rt_period_expire = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
@@ -7095,22 +7136,11 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
-		rq->rt.highest_prio = MAX_RT_PRIO;
-		rq->rt.overloaded = 0;
 		rq_attach_root(rq, &def_root_domain);
 #endif
 		init_rq_hrtick(rq);
-
 		atomic_set(&rq->nr_iowait, 0);
-
-		array = &rq->rt.active;
-		for (j = 0; j < MAX_RT_PRIO; j++) {
-			INIT_LIST_HEAD(array->queue + j);
-			__clear_bit(j, array->bitmap);
-		}
 		highest_cpu = i;
-		/* delimiter for bitsearch: */
-		__set_bit(MAX_RT_PRIO, array->bitmap);
 	}
 
 	set_load_weight(&init_task);
@@ -7282,7 +7312,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 #ifdef CONFIG_SMP
 /*
  * distribute shares of all task groups among their schedulable entities,
- * to reflect load distrbution across cpus.
+ * to reflect load distribution across cpus.
  */
 static int rebalance_shares(struct sched_domain *sd, int this_cpu)
 {
@@ -7349,7 +7379,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu)
  * sysctl_sched_max_bal_int_shares represents the maximum interval between
  * consecutive calls to rebalance_shares() in the same sched domain.
  *
- * These settings allows for the appropriate tradeoff between accuracy of
+ * These settings allows for the appropriate trade-off between accuracy of
  * fairness and the associated overhead.
  *
  */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 83fbbcb..fd10d96 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -45,6 +45,50 @@ static void update_rt_migration(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq)
+{
+	u64 period, ratio;
+
+	if (sysctl_sched_rt_ratio == SCHED_RT_FRAC)
+		return 0;
+
+	if (rt_rq->rt_throttled)
+		return 1;
+
+	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	if (rt_rq->rt_time > ratio) {
+		rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time;
+		return 1;
+	}
+
+	return 0;
+}
+
+static void update_sched_rt_period(struct rq *rq)
+{
+	while (rq->clock > rq->rt_period_expire) {
+		u64 period, ratio;
+
+		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+		ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+		rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
+		rq->rt_period_expire += period;
+	}
+
+	/*
+	 * When the rt throttle is expired, let them rip.
+	 * (XXX: use hrtick when available)
+	 */
+	if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
+		rq->rt.rt_throttled = 0;
+		if (!sched_rt_ratio_exceeded(rq, &rq->rt))
+			resched_task(rq->curr);
+	}
+}
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -66,6 +110,11 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.sum_exec_runtime += delta_exec;
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
+
+	rq->rt.rt_time += delta_exec;
+	update_sched_rt_period(rq);
+	if (sched_rt_ratio_exceeded(rq, &rq->rt))
+		resched_task(curr);
 }
 
 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -208,8 +257,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	struct rt_prio_array *array = &rq->rt.active;
 	struct task_struct *next;
 	struct list_head *queue;
+	struct rt_rq *rt_rq = &rq->rt;
 	int idx;
 
+	if (sched_rt_ratio_exceeded(rq, rt_rq))
+		return NULL;
+
 	idx = sched_find_first_bit(array->bitmap);
 	if (idx >= MAX_RT_PRIO)
 		return NULL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 96f31c1..3afbd25 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -306,7 +306,23 @@ static struct ctl_table kern_table[] = {
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
-		.mode		= 644,
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_period_ms",
+		.data		= &sysctl_sched_rt_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_ratio",
+		.data		= &sysctl_sched_rt_ratio,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-- 
cgit v1.1


From 6f505b16425a51270058e4a93441fe64de3dd435 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:30 +0100
Subject: sched: rt group scheduling

Extend group scheduling to also cover the realtime classes. It uses the time
limiting introduced by the previous patch to allow multiple realtime groups.

The hard time limit is required to keep behaviour deterministic.

The algorithms used make the realtime scheduler O(tg), linear scaling wrt the
number of task groups. This is the worst case behaviour I can't seem to get out
of, the avg. case of the algorithms can be improved, I focused on correctness
and worst case.

[ akpm@linux-foundation.org: move side-effects out of BUG_ON(). ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c     |   2 +-
 kernel/sched.c    | 283 +++++++++++++++++++++++----------
 kernel/sched_rt.c | 455 ++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 537 insertions(+), 203 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9f8ef32..0c969f4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1246,7 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
-	p->nr_cpus_allowed = current->nr_cpus_allowed;
+	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
diff --git a/kernel/sched.c b/kernel/sched.c
index e9a7bee..5ea2c53 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -161,6 +161,8 @@ struct rt_prio_array {
 
 struct cfs_rq;
 
+static LIST_HEAD(task_groups);
+
 /* task group related information */
 struct task_group {
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -171,6 +173,11 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 
+	struct sched_rt_entity **rt_se;
+	struct rt_rq **rt_rq;
+
+	unsigned int rt_ratio;
+
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
 	 * is allocated to the group. The more shares a group has, the more is
@@ -208,6 +215,7 @@ struct task_group {
 	unsigned long shares;
 
 	struct rcu_head rcu;
+	struct list_head list;
 };
 
 /* Default task group's sched entity on each cpu */
@@ -215,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 
+static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
+static struct rt_rq *init_rt_rq_p[NR_CPUS];
+
 /* task_group_mutex serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
  */
@@ -240,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares);
 struct task_group init_task_group = {
 	.se	= init_sched_entity_p,
 	.cfs_rq = init_cfs_rq_p,
+
+	.rt_se	= init_sched_rt_entity_p,
+	.rt_rq	= init_rt_rq_p,
 };
 
 #ifdef CONFIG_FAIR_USER_SCHED
@@ -269,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p)
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
 	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 	p->se.parent = task_group(p)->se[cpu];
+
+	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+	p->rt.parent = task_group(p)->rt_se[cpu];
 }
 
 static inline void lock_task_group_list(void)
@@ -297,7 +317,7 @@ static inline void unlock_doms_cur(void)
 
 #else
 
-static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline void lock_task_group_list(void) { }
 static inline void unlock_task_group_list(void) { }
 static inline void lock_doms_cur(void) { }
@@ -343,13 +363,22 @@ struct cfs_rq {
 struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	int highest_prio; /* highest queued rt task prio */
+#endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
-	int highest_prio; /* highest queued rt task prio */
 	int overloaded;
 #endif
+	int rt_throttled;
 	u64 rt_time;
-	u64 rt_throttled;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq;
+	struct list_head leaf_rt_rq_list;
+	struct task_group *tg;
+	struct sched_rt_entity *rt_se;
+#endif
 };
 
 #ifdef CONFIG_SMP
@@ -411,12 +440,14 @@ struct rq {
 	u64 nr_switches;
 
 	struct cfs_rq cfs;
+	struct rt_rq rt;
+	u64 rt_period_expire;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
+	struct list_head leaf_rt_rq_list;
 #endif
-	struct rt_rq rt;
-	u64 rt_period_expire;
 
 	/*
 	 * This is part of a global counter where only the total sum
@@ -613,9 +644,9 @@ const_debug unsigned int sysctl_sched_rt_period = 1000;
 
 /*
  * ratio of time -rt tasks may consume.
- * default: 100%
+ * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = SCHED_RT_FRAC;
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1337,7 +1368,7 @@ unsigned long weighted_cpuload(const int cpu)
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
-	set_task_cfs_rq(p, cpu);
+	set_task_rq(p, cpu);
 #ifdef CONFIG_SMP
 	/*
 	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -5281,7 +5312,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 		p->sched_class->set_cpus_allowed(p, &new_mask);
 	else {
 		p->cpus_allowed = new_mask;
-		p->nr_cpus_allowed = cpus_weight(new_mask);
+		p->rt.nr_cpus_allowed = cpus_weight(new_mask);
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
@@ -7079,8 +7110,50 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rq = rq;
+#endif
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
+		struct cfs_rq *cfs_rq, struct sched_entity *se,
+		int cpu, int add)
+{
+	tg->cfs_rq[cpu] = cfs_rq;
+	init_cfs_rq(cfs_rq, rq);
+	cfs_rq->tg = tg;
+	if (add)
+		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+	tg->se[cpu] = se;
+	se->cfs_rq = &rq->cfs;
+	se->my_q = cfs_rq;
+	se->load.weight = tg->shares;
+	se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
+	se->parent = NULL;
+}
+
+static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
+		struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+		int cpu, int add)
+{
+	tg->rt_rq[cpu] = rt_rq;
+	init_rt_rq(rt_rq, rq);
+	rt_rq->tg = tg;
+	rt_rq->rt_se = rt_se;
+	if (add)
+		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+
+	tg->rt_se[cpu] = rt_se;
+	rt_se->rt_rq = &rq->rt;
+	rt_se->my_q = rt_rq;
+	rt_se->parent = NULL;
+	INIT_LIST_HEAD(&rt_se->run_list);
+}
+#endif
+
 void __init sched_init(void)
 {
 	int highest_cpu = 0;
@@ -7090,6 +7163,10 @@ void __init sched_init(void)
 	init_defrootdomain();
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	list_add(&init_task_group.list, &task_groups);
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -7099,30 +7176,20 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->clock = 1;
 		init_cfs_rq(&rq->cfs, rq);
+		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-		{
-			struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
-			struct sched_entity *se =
-					 &per_cpu(init_sched_entity, i);
-
-			init_cfs_rq_p[i] = cfs_rq;
-			init_cfs_rq(cfs_rq, rq);
-			cfs_rq->tg = &init_task_group;
-			list_add(&cfs_rq->leaf_cfs_rq_list,
-							 &rq->leaf_cfs_rq_list);
-
-			init_sched_entity_p[i] = se;
-			se->cfs_rq = &rq->cfs;
-			se->my_q = cfs_rq;
-			se->load.weight = init_task_group_load;
-			se->load.inv_weight =
-				 div64_64(1ULL<<32, init_task_group_load);
-			se->parent = NULL;
-		}
 		init_task_group.shares = init_task_group_load;
+		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+		init_tg_cfs_entry(rq, &init_task_group,
+				&per_cpu(init_cfs_rq, i),
+				&per_cpu(init_sched_entity, i), i, 1);
+
+		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+		init_tg_rt_entry(rq, &init_task_group,
+				&per_cpu(init_rt_rq, i),
+				&per_cpu(init_sched_rt_entity, i), i, 1);
 #endif
-		init_rt_rq(&rq->rt, rq);
 		rq->rt_period_expire = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -7460,12 +7527,36 @@ static int load_balance_monitor(void *unused)
 }
 #endif	/* CONFIG_SMP */
 
+static void free_sched_group(struct task_group *tg)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (tg->cfs_rq)
+			kfree(tg->cfs_rq[i]);
+		if (tg->se)
+			kfree(tg->se[i]);
+		if (tg->rt_rq)
+			kfree(tg->rt_rq[i]);
+		if (tg->rt_se)
+			kfree(tg->rt_se[i]);
+	}
+
+	kfree(tg->cfs_rq);
+	kfree(tg->se);
+	kfree(tg->rt_rq);
+	kfree(tg->rt_se);
+	kfree(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 {
 	struct task_group *tg;
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct rt_rq *rt_rq;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	int i;
 
@@ -7479,100 +7570,89 @@ struct task_group *sched_create_group(void)
 	tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
 	if (!tg->se)
 		goto err;
+	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_rq)
+		goto err;
+	tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_se)
+		goto err;
+
+	tg->shares = NICE_0_LOAD;
+	tg->rt_ratio = 0; /* XXX */
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
-							 cpu_to_node(i));
+		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
-							cpu_to_node(i));
+		se = kmalloc_node(sizeof(struct sched_entity),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!se)
 			goto err;
 
-		memset(cfs_rq, 0, sizeof(struct cfs_rq));
-		memset(se, 0, sizeof(struct sched_entity));
+		rt_rq = kmalloc_node(sizeof(struct rt_rq),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		if (!rt_rq)
+			goto err;
 
-		tg->cfs_rq[i] = cfs_rq;
-		init_cfs_rq(cfs_rq, rq);
-		cfs_rq->tg = tg;
+		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		if (!rt_se)
+			goto err;
 
-		tg->se[i] = se;
-		se->cfs_rq = &rq->cfs;
-		se->my_q = cfs_rq;
-		se->load.weight = NICE_0_LOAD;
-		se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
-		se->parent = NULL;
+		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
 	}
 
-	tg->shares = NICE_0_LOAD;
-
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		cfs_rq = tg->cfs_rq[i];
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+		rt_rq = tg->rt_rq[i];
+		list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 	}
+	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
 	return tg;
 
 err:
-	for_each_possible_cpu(i) {
-		if (tg->cfs_rq)
-			kfree(tg->cfs_rq[i]);
-		if (tg->se)
-			kfree(tg->se[i]);
-	}
-	kfree(tg->cfs_rq);
-	kfree(tg->se);
-	kfree(tg);
-
+	free_sched_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group(struct rcu_head *rhp)
+static void free_sched_group_rcu(struct rcu_head *rhp)
 {
-	struct task_group *tg = container_of(rhp, struct task_group, rcu);
-	struct cfs_rq *cfs_rq;
-	struct sched_entity *se;
-	int i;
-
 	/* now it should be safe to free those cfs_rqs */
-	for_each_possible_cpu(i) {
-		cfs_rq = tg->cfs_rq[i];
-		kfree(cfs_rq);
-
-		se = tg->se[i];
-		kfree(se);
-	}
-
-	kfree(tg->cfs_rq);
-	kfree(tg->se);
-	kfree(tg);
+	free_sched_group(container_of(rhp, struct task_group, rcu));
 }
 
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	struct cfs_rq *cfs_rq = NULL;
+	struct rt_rq *rt_rq = NULL;
 	int i;
 
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		rt_rq = tg->rt_rq[i];
+		list_del_rcu(&rt_rq->leaf_rt_rq_list);
 	}
+	list_del_rcu(&tg->list);
 	unlock_task_group_list();
 
 	BUG_ON(!cfs_rq);
 
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group);
+	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 
 /* change task's runqueue when it moves between groups.
@@ -7588,11 +7668,6 @@ void sched_move_task(struct task_struct *tsk)
 
 	rq = task_rq_lock(tsk, &flags);
 
-	if (tsk->sched_class != &fair_sched_class) {
-		set_task_cfs_rq(tsk, task_cpu(tsk));
-		goto done;
-	}
-
 	update_rq_clock(rq);
 
 	running = task_current(rq, tsk);
@@ -7604,7 +7679,7 @@ void sched_move_task(struct task_struct *tsk)
 			tsk->sched_class->put_prev_task(rq, tsk);
 	}
 
-	set_task_cfs_rq(tsk, task_cpu(tsk));
+	set_task_rq(tsk, task_cpu(tsk));
 
 	if (on_rq) {
 		if (unlikely(running))
@@ -7612,7 +7687,6 @@ void sched_move_task(struct task_struct *tsk)
 		enqueue_task(rq, tsk, 0);
 	}
 
-done:
 	task_rq_unlock(rq, &flags);
 }
 
@@ -7697,6 +7771,31 @@ unsigned long sched_group_shares(struct task_group *tg)
 	return tg->shares;
 }
 
+/*
+ * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ */
+int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+{
+	struct task_group *tgi;
+	unsigned long total = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tgi, &task_groups, list)
+		total += tgi->rt_ratio;
+	rcu_read_unlock();
+
+	if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
+		return -EINVAL;
+
+	tg->rt_ratio = rt_ratio;
+	return 0;
+}
+
+unsigned long sched_group_rt_ratio(struct task_group *tg)
+{
+	return tg->rt_ratio;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7772,12 +7871,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 	return (u64) tg->shares;
 }
 
+static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_ratio_val)
+{
+	return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+}
+
+static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return (u64) tg->rt_ratio;
+}
+
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
 		.read_uint = cpu_shares_read_uint,
 		.write_uint = cpu_shares_write_uint,
 	},
+	{
+		.name = "rt_ratio",
+		.read_uint = cpu_rt_ratio_read_uint,
+		.write_uint = cpu_rt_ratio_write_uint,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fd10d96..1178257 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -45,47 +45,167 @@ static void update_rt_migration(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
-static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq)
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 {
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return !list_empty(&rt_se->run_list);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+{
+	if (!rt_rq->tg)
+		return SCHED_RT_FRAC;
+
+	return rt_rq->tg->rt_ratio;
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	return rt_se->rt_rq;
+}
+
+#define for_each_sched_rt_entity(rt_se) \
+	for (; rt_se; rt_se = rt_se->parent)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+{
+	struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		enqueue_rt_entity(rt_se);
+		resched_task(rq_of_rt_rq(rt_rq)->curr);
+	}
+}
+
+static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+{
+	struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+	if (rt_se && on_rt_rq(rt_se))
+		dequeue_rt_entity(rt_se);
+}
+
+#else
+
+static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+{
+	return sysctl_sched_rt_ratio;
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p = rt_task_of(rt_se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->rt;
+}
+
+#define for_each_sched_rt_entity(rt_se) \
+	for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return NULL;
+}
+
+static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+{
+}
+
+static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+{
+}
+
+#endif
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+	if (rt_rq)
+		return rt_rq->highest_prio;
+#endif
+
+	return rt_task_of(rt_se)->prio;
+}
+
+static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+{
+	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
 	u64 period, ratio;
 
-	if (sysctl_sched_rt_ratio == SCHED_RT_FRAC)
+	if (rt_ratio == SCHED_RT_FRAC)
 		return 0;
 
 	if (rt_rq->rt_throttled)
 		return 1;
 
 	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-	ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
-		rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time;
+		rt_rq->rt_throttled = 1;
+		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
 
 	return 0;
 }
 
+static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
+{
+	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	if (rt_rq->rt_throttled) {
+		rt_rq->rt_throttled = 0;
+		sched_rt_ratio_enqueue(rt_rq);
+	}
+}
+
 static void update_sched_rt_period(struct rq *rq)
 {
-	while (rq->clock > rq->rt_period_expire) {
-		u64 period, ratio;
+	struct rt_rq *rt_rq;
+	u64 period;
 
+	while (rq->clock > rq->rt_period_expire) {
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-		ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-		rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
 		rq->rt_period_expire += period;
-	}
 
-	/*
-	 * When the rt throttle is expired, let them rip.
-	 * (XXX: use hrtick when available)
-	 */
-	if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
-		rq->rt.rt_throttled = 0;
-		if (!sched_rt_ratio_exceeded(rq, &rq->rt))
-			resched_task(rq->curr);
+		for_each_leaf_rt_rq(rt_rq, rq)
+			__update_sched_rt_period(rt_rq, period);
 	}
 }
 
@@ -96,6 +216,8 @@ static void update_sched_rt_period(struct rq *rq)
 static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
+	struct sched_rt_entity *rt_se = &curr->rt;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;
 
 	if (!task_has_rt_policy(curr))
@@ -111,95 +233,184 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
-	rq->rt.rt_time += delta_exec;
-	update_sched_rt_period(rq);
-	if (sched_rt_ratio_exceeded(rq, &rq->rt))
+	rt_rq->rt_time += delta_exec;
+	/*
+	 * might make it a tad more accurate:
+	 *
+	 * update_sched_rt_period(rq);
+	 */
+	if (sched_rt_ratio_exceeded(rt_rq))
 		resched_task(curr);
 }
 
-static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_task(p));
-	rq->rt.rt_nr_running++;
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+		rt_rq->highest_prio = rt_se_prio(rt_se);
+#endif
 #ifdef CONFIG_SMP
-	if (p->prio < rq->rt.highest_prio)
-		rq->rt.highest_prio = p->prio;
-	if (p->nr_cpus_allowed > 1)
+	if (rt_se->nr_cpus_allowed > 1) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rq->rt.rt_nr_migratory++;
+	}
 
-	update_rt_migration(rq);
-#endif /* CONFIG_SMP */
+	update_rt_migration(rq_of_rt_rq(rt_rq));
+#endif
 }
 
-static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_task(p));
-	WARN_ON(!rq->rt.rt_nr_running);
-	rq->rt.rt_nr_running--;
-#ifdef CONFIG_SMP
-	if (rq->rt.rt_nr_running) {
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	WARN_ON(!rt_rq->rt_nr_running);
+	rt_rq->rt_nr_running--;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	if (rt_rq->rt_nr_running) {
 		struct rt_prio_array *array;
 
-		WARN_ON(p->prio < rq->rt.highest_prio);
-		if (p->prio == rq->rt.highest_prio) {
+		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
+		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
 			/* recalculate */
-			array = &rq->rt.active;
-			rq->rt.highest_prio =
+			array = &rt_rq->active;
+			rt_rq->highest_prio =
 				sched_find_first_bit(array->bitmap);
 		} /* otherwise leave rq->highest prio alone */
 	} else
-		rq->rt.highest_prio = MAX_RT_PRIO;
-	if (p->nr_cpus_allowed > 1)
+		rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
+#ifdef CONFIG_SMP
+	if (rt_se->nr_cpus_allowed > 1) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rq->rt.rt_nr_migratory--;
+	}
 
-	update_rt_migration(rq);
+	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 }
 
-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+	struct rt_prio_array *array = &rt_rq->active;
+	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	list_add_tail(&p->rt.run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	inc_cpu_load(rq, p->se.load.weight);
+	if (group_rq && group_rq->rt_throttled)
+		return;
 
-	inc_rt_tasks(p, rq);
+	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
-	if (wakeup)
-		p->rt.timeout = 0;
+	inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+	struct rt_prio_array *array = &rt_rq->active;
+
+	list_del_init(&rt_se->run_list);
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
+		__clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+	dec_rt_tasks(rt_se, rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top - down.
+ *
+ * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
+ *      doesn't matter much for now, as h=2 for GROUP_SCHED.
+ */
+static void dequeue_rt_stack(struct task_struct *p)
+{
+	struct sched_rt_entity *rt_se, *top_se;
+
+	/*
+	 * dequeue all, top - down.
+	 */
+	do {
+		rt_se = &p->rt;
+		top_se = NULL;
+		for_each_sched_rt_entity(rt_se) {
+			if (on_rt_rq(rt_se))
+				top_se = rt_se;
+		}
+		if (top_se)
+			dequeue_rt_entity(top_se);
+	} while (top_se);
 }
 
 /*
  * Adding/removing a task to/from a priority array:
  */
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	struct sched_rt_entity *rt_se = &p->rt;
+
+	if (wakeup)
+		rt_se->timeout = 0;
+
+	dequeue_rt_stack(p);
+
+	/*
+	 * enqueue everybody, bottom - up.
+	 */
+	for_each_sched_rt_entity(rt_se)
+		enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
+}
+
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq;
 
 	update_curr_rt(rq);
 
-	list_del(&p->rt.run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
-	dec_cpu_load(rq, p->se.load.weight);
+	dequeue_rt_stack(p);
+
+	/*
+	 * re-enqueue all non-empty rt_rq entities.
+	 */
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = group_rt_rq(rt_se);
+		if (rt_rq && rt_rq->rt_nr_running)
+			enqueue_rt_entity(rt_se);
+	}
 
-	dec_rt_tasks(p, rq);
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
+static
+void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+
+	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+}
+
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq;
 
-	list_move_tail(&p->rt.run_list, array->queue + p->prio);
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = rt_rq_of_se(rt_se);
+		requeue_rt_entity(rt_rq, rt_se);
+	}
 }
 
-static void
-yield_task_rt(struct rq *rq)
+static void yield_task_rt(struct rq *rq)
 {
 	requeue_task_rt(rq, rq->curr);
 }
@@ -229,7 +440,7 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 	 * cold cache anyway.
 	 */
 	if (unlikely(rt_task(rq->curr)) &&
-	    (p->nr_cpus_allowed > 1)) {
+	    (p->rt.nr_cpus_allowed > 1)) {
 		int cpu = find_lowest_rq(p);
 
 		return (cpu == -1) ? task_cpu(p) : cpu;
@@ -252,27 +463,51 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq)
 {
-	struct rt_prio_array *array = &rq->rt.active;
-	struct task_struct *next;
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *next = NULL;
 	struct list_head *queue;
-	struct rt_rq *rt_rq = &rq->rt;
 	int idx;
 
-	if (sched_rt_ratio_exceeded(rq, rt_rq))
-		return NULL;
+	if (sched_rt_ratio_exceeded(rt_rq))
+		goto out;
 
 	idx = sched_find_first_bit(array->bitmap);
-	if (idx >= MAX_RT_PRIO)
-		return NULL;
+	BUG_ON(idx >= MAX_RT_PRIO);
 
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, rt.run_list);
+	next = list_entry(queue->next, struct sched_rt_entity, run_list);
+ out:
+	return next;
+}
 
-	next->se.exec_start = rq->clock;
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct sched_rt_entity *rt_se;
+	struct task_struct *p;
+	struct rt_rq *rt_rq;
 
-	return next;
+ retry:
+	rt_rq = &rq->rt;
+
+	if (unlikely(!rt_rq->rt_nr_running))
+		return NULL;
+
+	if (sched_rt_ratio_exceeded(rt_rq))
+		return NULL;
+
+	do {
+		rt_se = pick_next_rt_entity(rq, rt_rq);
+		if (unlikely(!rt_se))
+			goto retry;
+		rt_rq = group_rt_rq(rt_se);
+	} while (rt_rq);
+
+	p = rt_task_of(rt_se);
+	p->se.exec_start = rq->clock;
+	return p;
 }
 
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
@@ -282,6 +517,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
@@ -292,7 +528,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
 	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
-	    (p->nr_cpus_allowed > 1))
+	    (p->rt.nr_cpus_allowed > 1))
 		return 1;
 	return 0;
 }
@@ -300,52 +536,33 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 /* Return the second highest RT task, NULL otherwise */
 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 {
-	struct rt_prio_array *array = &rq->rt.active;
-	struct task_struct *next;
-	struct list_head *queue;
+	struct task_struct *next = NULL;
+	struct sched_rt_entity *rt_se;
+	struct rt_prio_array *array;
+	struct rt_rq *rt_rq;
 	int idx;
 
-	if (likely(rq->rt.rt_nr_running < 2))
-		return NULL;
-
-	idx = sched_find_first_bit(array->bitmap);
-	if (unlikely(idx >= MAX_RT_PRIO)) {
-		WARN_ON(1); /* rt_nr_running is bad */
-		return NULL;
-	}
-
-	queue = array->queue + idx;
-	BUG_ON(list_empty(queue));
-
-	next = list_entry(queue->next, struct task_struct, rt.run_list);
-	if (unlikely(pick_rt_task(rq, next, cpu)))
-		goto out;
-
-	if (queue->next->next != queue) {
-		/* same prio task */
-		next = list_entry(queue->next->next, struct task_struct,
-				  rt.run_list);
-		if (pick_rt_task(rq, next, cpu))
-			goto out;
-	}
-
- retry:
-	/* slower, but more flexible */
-	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-	if (unlikely(idx >= MAX_RT_PRIO))
-		return NULL;
-
-	queue = array->queue + idx;
-	BUG_ON(list_empty(queue));
-
-	list_for_each_entry(next, queue, rt.run_list) {
-		if (pick_rt_task(rq, next, cpu))
-			goto out;
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		array = &rt_rq->active;
+		idx = sched_find_first_bit(array->bitmap);
+ next_idx:
+		if (idx >= MAX_RT_PRIO)
+			continue;
+		if (next && next->prio < idx)
+			continue;
+		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+			struct task_struct *p = rt_task_of(rt_se);
+			if (pick_rt_task(rq, p, cpu)) {
+				next = p;
+				break;
+			}
+		}
+		if (!next) {
+			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+			goto next_idx;
+		}
 	}
 
-	goto retry;
-
- out:
 	return next;
 }
 
@@ -774,12 +991,12 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 	 * Update the migration status of the RQ if we have an RT task
 	 * which is running AND changing its weight value.
 	 */
-	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
+	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
-		if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
+		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
 			rq->rt.rt_nr_migratory++;
-		} else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
 			BUG_ON(!rq->rt.rt_nr_migratory);
 			rq->rt.rt_nr_migratory--;
 		}
@@ -788,7 +1005,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 	}
 
 	p->cpus_allowed    = *new_mask;
-	p->nr_cpus_allowed = weight;
+	p->rt.nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
-- 
cgit v1.1


From 614ee1f61f667b02165c1ae0c1357048dc6d94a0 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 25 Jan 2008 21:08:30 +0100
Subject: sched: pull_rt_task() cleanup

"goto out" is an odd way to spell "skip".

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1178257..1144bf5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -877,10 +877,8 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * Are there still pullable RT tasks?
 		 */
-		if (src_rq->rt.rt_nr_running <= 1) {
-			spin_unlock(&src_rq->lock);
-			continue;
-		}
+		if (src_rq->rt.rt_nr_running <= 1)
+			goto skip;
 
 		p = pick_next_highest_task_rt(src_rq, this_cpu);
 
@@ -904,7 +902,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 */
 			if (p->prio < src_rq->curr->prio ||
 			    (next && next->prio < src_rq->curr->prio))
-				goto out;
+				goto skip;
 
 			ret = 1;
 
@@ -924,7 +922,7 @@ static int pull_rt_task(struct rq *this_rq)
 			next = p;
 
 		}
- out:
+ skip:
 		spin_unlock(&src_rq->lock);
 	}
 
-- 
cgit v1.1


From 48d5e258216f1c7713633439beb98a38c7290649 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:31 +0100
Subject: sched: rt throttling vs no_hz

We need to teach no_hz about the rt throttling because its tick driven.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c           | 23 ++++++++++++++++++++++-
 kernel/sched_rt.c        | 30 ++++++++++++++++--------------
 kernel/time/tick-sched.c |  5 +++++
 3 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5ea2c53..22712b2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	u64 rt_period_expire;
+	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 delta;
+
+	if (!rq->rt_throttled)
+		return 0;
+
+	if (rq->clock > rq->rt_period_expire)
+		return 1;
+
+	delta = rq->rt_period_expire - rq->clock;
+	do_div(delta, NSEC_PER_SEC / HZ);
+
+	return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7102,9 +7120,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
-	rt_rq->highest_prio = MAX_RT_PRIO;
 	rt_rq->overloaded = 0;
 #endif
 
@@ -7191,6 +7211,7 @@ void __init sched_init(void)
 				&per_cpu(init_sched_rt_entity, i), i, 1);
 #endif
 		rq->rt_period_expire = 0;
+		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1144bf5..8bfdb3f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
+
 		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
 	return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-	if (rt_rq->rt_throttled) {
-		rt_rq->rt_throttled = 0;
-		sched_rt_ratio_enqueue(rt_rq);
-	}
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
 	struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struct rq *rq)
 		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
 		rq->rt_period_expire += period;
 
-		for_each_leaf_rt_rq(rt_rq, rq)
-			__update_sched_rt_period(rt_rq, period);
+		for_each_leaf_rt_rq(rt_rq, rq) {
+			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+			if (rt_rq->rt_throttled) {
+				rt_rq->rt_throttled = 0;
+				sched_rt_ratio_enqueue(rt_rq);
+			}
+		}
+
+		rq->rt_throttled = 0;
 	}
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb89fa8..5f9fb64 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
+	rt_jiffies = rt_needs_cpu(cpu);
+	if (rt_jiffies && rt_jiffies < delta_jiffies)
+		delta_jiffies = rt_jiffies;
+
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*
-- 
cgit v1.1


From 2d44ae4d7135b9aee26439b3523b43473381bc5f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:31 +0100
Subject: hrtimer: clean up cpu->base locking tricks

In order to more easily allow for the scheduler to use timers, clean up
the locking a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c         | 20 +++++++++++++++++++-
 kernel/time/tick-sched.c |  8 --------
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f994bb8..9f850ca 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
+			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
+			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 					 HRTIMER_STATE_CALLBACK, 0);
 			timer_stats_account_hrtimer(timer);
 
+			fn = timer->function;
+			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+				/*
+				 * Used for scheduler timers, avoid lock
+				 * inversion with rq->lock and tasklist_lock.
+				 *
+				 * These timers are required to deal with
+				 * enqueue expiry themselves and are not
+				 * allowed to migrate.
+				 */
+				spin_unlock(&cpu_base->lock);
+				restart = fn(timer);
+				spin_lock(&cpu_base->lock);
+			} else
+				restart = fn(timer);
+
 			/*
 			 * Note: We clear the CALLBACK bit after
 			 * enqueue_hrtimer to avoid reprogramming of
 			 * the event hardware. This happens at the end
 			 * of this function anyway.
 			 */
-			if (timer->function(timer) != HRTIMER_NORESTART) {
+			if (restart != HRTIMER_NORESTART) {
 				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
 				enqueue_hrtimer(timer, base, 0);
 			}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5f9fb64..1a21b6f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
-	struct hrtimer_cpu_base *base = timer->base->cpu_base;
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 	int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 			touch_softlockup_watchdog();
 			ts->idle_jiffies++;
 		}
-		/*
-		 * update_process_times() might take tasklist_lock, hence
-		 * drop the base lock. sched-tick hrtimers are per-CPU and
-		 * never accessible by userspace APIs, so this is safe to do.
-		 */
-		spin_unlock(&base->lock);
 		update_process_times(user_mode(regs));
 		profile_tick(CPU_PROFILING);
-		spin_lock(&base->lock);
 	}
 
 	/* Do not restart, when we are in the idle loop */
-- 
cgit v1.1


From d3d74453c34f8fd87674a8cf5b8a327c68f22e99 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:31 +0100
Subject: hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback

Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.

Fix this up by splitting it similar to the highres=on case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 270 ++++++++++++++++++++++++++++---------------------------
 kernel/timer.c   |   3 +-
 2 files changed, 141 insertions(+), 132 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9f850ca..061ae28 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
 }
 #endif /* BITS_PER_LONG >= 64 */
 
+/*
+ * Check, whether the timer is on the callback pending list
+ */
+static inline int hrtimer_cb_pending(const struct hrtimer *timer)
+{
+	return timer->state & HRTIMER_STATE_PENDING;
+}
+
+/*
+ * Remove a timer from the callback pending list
+ */
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
+{
+	list_del_init(&timer->cb_entry);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
 }
 
 /*
- * Check, whether the timer is on the callback pending list
- */
-static inline int hrtimer_cb_pending(const struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_PENDING;
-}
-
-/*
- * Remove a timer from the callback pending list
- */
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
-{
-	list_del_init(&timer->cb_entry);
-}
-
-/*
  * Initialize the high resolution related parts of cpu_base
  */
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 {
 	base->expires_next.tv64 = KTIME_MAX;
 	base->hres_active = 0;
-	INIT_LIST_HEAD(&base->cb_pending);
 }
 
 /*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
  */
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
-	INIT_LIST_HEAD(&timer->cb_entry);
 }
 
 /*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 {
 	return 0;
 }
-static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+				    struct hrtimer_clock_base *base)
+{
+	return 0;
+}
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &cpu_base->clock_base[clock_id];
+	INIT_LIST_HEAD(&timer->cb_entry);
 	hrtimer_init_timer_hres(timer);
 
 #ifdef CONFIG_TIMER_STATS
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+	spin_lock_irq(&cpu_base->lock);
+
+	while (!list_empty(&cpu_base->cb_pending)) {
+		enum hrtimer_restart (*fn)(struct hrtimer *);
+		struct hrtimer *timer;
+		int restart;
+
+		timer = list_entry(cpu_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		timer_stats_account_hrtimer(timer);
+
+		fn = timer->function;
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+		spin_unlock_irq(&cpu_base->lock);
+
+		restart = fn(timer);
+
+		spin_lock_irq(&cpu_base->lock);
+
+		timer->state &= ~HRTIMER_STATE_CALLBACK;
+		if (restart == HRTIMER_RESTART) {
+			BUG_ON(hrtimer_active(timer));
+			/*
+			 * Enqueue the timer, allow reprogramming of the event
+			 * device
+			 */
+			enqueue_hrtimer(timer, timer->base, 1);
+		} else if (hrtimer_active(timer)) {
+			/*
+			 * If the timer was rearmed on another CPU, reprogram
+			 * the event device.
+			 */
+			if (timer->base->first == &timer->node)
+				hrtimer_reprogram(timer, timer->base);
+		}
+	}
+	spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	enum hrtimer_restart (*fn)(struct hrtimer *);
+	int restart;
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer_stats_account_hrtimer(timer);
+
+	fn = timer->function;
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+		/*
+		 * Used for scheduler timers, avoid lock inversion with
+		 * rq->lock and tasklist_lock.
+		 *
+		 * These timers are required to deal with enqueue expiry
+		 * themselves and are not allowed to migrate.
+		 */
+		spin_unlock(&cpu_base->lock);
+		restart = fn(timer);
+		spin_lock(&cpu_base->lock);
+	} else
+		restart = fn(timer);
+
+	/*
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+	 * reprogramming of the event hardware. This happens at the end of this
+	 * function anyway.
+	 */
+	if (restart != HRTIMER_NORESTART) {
+		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+		enqueue_hrtimer(timer, base, 0);
+	}
+	timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1063,9 +1144,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = base->first)) {
-			enum hrtimer_restart (*fn)(struct hrtimer *);
 			struct hrtimer *timer;
-			int restart;
 
 			timer = rb_entry(node, struct hrtimer, node);
 
@@ -1089,37 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 				continue;
 			}
 
-			__remove_hrtimer(timer, base,
-					 HRTIMER_STATE_CALLBACK, 0);
-			timer_stats_account_hrtimer(timer);
-
-			fn = timer->function;
-			if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
-				/*
-				 * Used for scheduler timers, avoid lock
-				 * inversion with rq->lock and tasklist_lock.
-				 *
-				 * These timers are required to deal with
-				 * enqueue expiry themselves and are not
-				 * allowed to migrate.
-				 */
-				spin_unlock(&cpu_base->lock);
-				restart = fn(timer);
-				spin_lock(&cpu_base->lock);
-			} else
-				restart = fn(timer);
-
-			/*
-			 * Note: We clear the CALLBACK bit after
-			 * enqueue_hrtimer to avoid reprogramming of
-			 * the event hardware. This happens at the end
-			 * of this function anyway.
-			 */
-			if (restart != HRTIMER_NORESTART) {
-				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-				enqueue_hrtimer(timer, base, 0);
-			}
-			timer->state &= ~HRTIMER_STATE_CALLBACK;
+			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
 		base++;
@@ -1140,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
+	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
 
-		timer_stats_account_hrtimer(timer);
+#endif	/* CONFIG_HIGH_RES_TIMERS */
 
-		fn = timer->function;
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
-		restart = fn(timer);
+	if (hrtimer_hres_active())
+		return;
 
-		spin_lock_irq(&cpu_base->lock);
+	/*
+	 * This _is_ ugly: We have to check in the softirq context,
+	 * whether we can switch to highres and / or nohz mode. The
+	 * clocksource switch happens in the timer interrupt with
+	 * xtime_lock held. Notification from there only sets the
+	 * check bit in the tick_oneshot code, otherwise we might
+	 * deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+		hrtimer_switch_to_hres();
 
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			if (timer->base->first == &timer->node)
-				hrtimer_reprogram(timer, timer->base);
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
+	run_hrtimer_pending(cpu_base);
 }
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
-
 /*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
  */
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 				     int index)
@@ -1199,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 	if (base->get_softirq_time)
 		base->softirq_time = base->get_softirq_time();
 
-	spin_lock_irq(&cpu_base->lock);
+	spin_lock(&cpu_base->lock);
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		int restart;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart != HRTIMER_NORESTART) {
-			BUG_ON(hrtimer_active(timer));
-			enqueue_hrtimer(timer, base, 0);
+		if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+			__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+			list_add_tail(&timer->cb_entry,
+					&base->cpu_base->cb_pending);
+			continue;
 		}
+
+		__run_hrtimer(timer);
 	}
-	spin_unlock_irq(&cpu_base->lock);
+	spin_unlock(&cpu_base->lock);
 }
 
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1247,18 +1266,6 @@ void hrtimer_run_queues(void)
 	if (hrtimer_hres_active())
 		return;
 
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		if (hrtimer_switch_to_hres())
-			return;
-
 	hrtimer_get_softirq_time(cpu_base);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1407,6 +1414,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 
+	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
 }
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 2a00c22..f739dfb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
-	hrtimer_run_queues();
+	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
  */
 void run_local_timers(void)
 {
+	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
 	softlockup_tick();
 }
-- 
cgit v1.1


From 37bb6cb4097e29ffee970065b74499cbf10603a3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:32 +0100
Subject: hrtimer: unlock hrtimer_wakeup

hrtimer_wakeup creates a

  base->lock
    rq->lock

lock dependancy. Avoid this by switching to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
which doesn't hold base->lock.

This fully untangles hrtimer locks from the scheduler locks, and allows
hrtimer usage in the scheduler proper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 061ae28..bd5d6b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1293,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 }
 
@@ -1304,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
+		if (!hrtimer_active(&t->timer))
+			t->task = NULL;
 
 		if (likely(t->task))
 			schedule();
-- 
cgit v1.1


From 1020387f5f3b52929b387103cf976321981f8e26 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:32 +0100
Subject: sched: rt-group: reduce rescheduling

Only reschedule if the new group has a higher prio task.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8bfdb3f..f1f215d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
 	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
 		enqueue_rt_entity(rt_se);
-		resched_task(rq_of_rt_rq(rt_rq)->curr);
+		if (rt_rq->highest_prio < curr->prio)
+			resched_task(curr);
 	}
 }
 
-- 
cgit v1.1


From 5a52dd50091b6a6e710a1293db741028f8cc5aac Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:32 +0100
Subject: sched: rt-watchdog: fix .rlim_max = RLIM_INFINITY

Remove the curious logic to set it_sched_expires in the future. It useless
because rt.timeout wouldn't be incremented anyway.

Explicity check for RLIM_INFINITY as a test programm that had a 1s soft limit
and a inf hard limit would SIGKILL at 1s. This is because RLIM_INFINITY+d-1
is d-2.

Signed-off-by: Peter Zijlsta <a.p.zijlstra@chello.nl>
CC: Michal Schmidt <mschmidt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 3 ++-
 kernel/sched_rt.c         | 8 +-------
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2c076b3..0b7c82a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1020,7 +1020,8 @@ static void check_thread_timers(struct task_struct *tsk,
 		unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
 		unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
 
-		if (tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		if (hard != RLIM_INFINITY &&
+		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f1f215d..2dac5eb 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1125,13 +1125,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 
 		p->rt.timeout++;
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-		if (next > p->rt.timeout) {
-			u64 next_time = p->se.sum_exec_runtime;
-
-			next_time += next * (NSEC_PER_SEC/HZ);
-			if (p->it_sched_expires > next_time)
-				p->it_sched_expires = next_time;
-		} else
+		if (p->rt.timeout > next)
 			p->it_sched_expires = p->se.sum_exec_runtime;
 	}
 }
-- 
cgit v1.1


From 21aa9280b9f4e9e68d3fa8990df6c9d7fd71f994 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: debug: show being-loaded/being-unloaded indicator for modules

It's rather common that an oops/WARN_ON/BUG happens during the load or
unload of a module. Unfortunatly, it's not always easy to see directly
which module is being loaded/unloaded from the oops itself. Worse,
it's not even always possible to ask the bug reporter, since there
are so many components (udev etc) that auto-load modules that there's
a good chance that even the reporter doesn't know which module this is.

This patch extends the existing "show if it's tainting" print code,
which is used as part of printing the modules in the oops/BUG/WARN_ON
to include a "+" for "being loaded" and a "-" for "being unloaded".

As a result this extension, the "taint_flags()" function gets renamed to
"module_flags()" (and takes a module struct as argument, not a taint
flags int).

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index dcb8a2c..5bbf225 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2357,21 +2357,30 @@ static void m_stop(struct seq_file *m, void *p)
 	mutex_unlock(&module_mutex);
 }
 
-static char *taint_flags(unsigned int taints, char *buf)
+static char *module_flags(struct module *mod, char *buf)
 {
 	int bx = 0;
 
-	if (taints) {
+	if (mod->taints ||
+	    mod->state == MODULE_STATE_GOING ||
+	    mod->state == MODULE_STATE_COMING) {
 		buf[bx++] = '(';
-		if (taints & TAINT_PROPRIETARY_MODULE)
+		if (mod->taints & TAINT_PROPRIETARY_MODULE)
 			buf[bx++] = 'P';
-		if (taints & TAINT_FORCED_MODULE)
+		if (mod->taints & TAINT_FORCED_MODULE)
 			buf[bx++] = 'F';
 		/*
 		 * TAINT_FORCED_RMMOD: could be added.
 		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
 		 * apply to modules.
 		 */
+
+		/* Show a - for module-is-being-unloaded */
+		if (mod->state == MODULE_STATE_GOING)
+			buf[bx++] = '-';
+		/* Show a + for module-is-being-loaded */
+		if (mod->state == MODULE_STATE_COMING)
+			buf[bx++] = '+';
 		buf[bx++] = ')';
 	}
 	buf[bx] = '\0';
@@ -2398,7 +2407,7 @@ static int m_show(struct seq_file *m, void *p)
 
 	/* Taints info */
 	if (mod->taints)
-		seq_printf(m, " %s", taint_flags(mod->taints, buf));
+		seq_printf(m, " %s", module_flags(mod, buf));
 
 	seq_printf(m, "\n");
 	return 0;
@@ -2493,7 +2502,7 @@ void print_modules(void)
 
 	printk("Modules linked in:");
 	list_for_each_entry(mod, &modules, list)
-		printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
+		printk(" %s%s", mod->name, module_flags(mod, buf));
 	printk("\n");
 }
 
-- 
cgit v1.1


From e14af7eeb47ea96c52741c5e5fa010d33daf6973 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: debug: track and print last unloaded module in the oops trace

Based on a suggestion from Andi:

 In various cases, the unload of a module may leave some bad state around
 that causes a kernel crash AFTER a module is unloaded; and it's then hard
 to find which module caused that.

This patch tracks the last unloaded module, and prints this as part of the
module list in the oops trace.

Right now, only the last 1 module is tracked; I expect that this is enough
for the vast majority of cases where this information matters; if it turns
out that tracking more is important, we can always extend it to that.

[ mingo@elte.hu: build fix ]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5bbf225..1bb4c5e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = {                    \
 MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 
+static char last_unloaded_module[MODULE_NAME_LEN+1];
+
 #ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
@@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		mod->exit();
 		mutex_lock(&module_mutex);
 	}
+	/* Store the name of the last unloaded module for diagnostic purposes */
+	sprintf(last_unloaded_module, mod->name);
 	free_module(mod);
 
  out:
@@ -2503,6 +2507,8 @@ void print_modules(void)
 	printk("Modules linked in:");
 	list_for_each_entry(mod, &modules, list)
 		printk(" %s%s", mod->name, module_flags(mod, buf));
+	if (last_unloaded_module[0])
+		printk(" [last unloaded: %s]", last_unloaded_module);
 	printk("\n");
 }
 
-- 
cgit v1.1


From 58b8a73ab8becfcaea84abc2a06038281efa4c8a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: sched: make PREEMPT_BKL the default

make PREEMPT_BKL the default.

precursor to removal of the !PREEMPT_BKL code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.preempt | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 61fa116..4420ef4 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -53,15 +53,8 @@ config PREEMPT
 endchoice
 
 config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
+	def_bool y
 	depends on SMP || PREEMPT
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
 
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
-- 
cgit v1.1


From 6478d8800b75253b2a934ddcb734e13ade023ad0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: sched: remove the !PREEMPT_BKL code

remove the !PREEMPT_BKL code.

this removes 160 lines of legacy code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.preempt |  4 ----
 kernel/sched.c         | 19 +++----------------
 2 files changed, 3 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 4420ef4..0669b70 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,6 @@ config PREEMPT
 
 endchoice
 
-config PREEMPT_BKL
-	def_bool y
-	depends on SMP || PREEMPT
-
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
 	select DEBUG_FS
diff --git a/kernel/sched.c b/kernel/sched.c
index 22712b2..629614a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3955,10 +3955,9 @@ EXPORT_SYMBOL(schedule);
 asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
-#ifdef CONFIG_PREEMPT_BKL
 	struct task_struct *task = current;
 	int saved_lock_depth;
-#endif
+
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
@@ -3974,14 +3973,10 @@ asmlinkage void __sched preempt_schedule(void)
 		 * clear ->lock_depth so that schedule() doesnt
 		 * auto-release the semaphore:
 		 */
-#ifdef CONFIG_PREEMPT_BKL
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
-#endif
 		schedule();
-#ifdef CONFIG_PREEMPT_BKL
 		task->lock_depth = saved_lock_depth;
-#endif
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -4002,10 +3997,9 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 	struct thread_info *ti = current_thread_info();
-#ifdef CONFIG_PREEMPT_BKL
 	struct task_struct *task = current;
 	int saved_lock_depth;
-#endif
+
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
@@ -4017,16 +4011,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 * clear ->lock_depth so that schedule() doesnt
 		 * auto-release the semaphore:
 		 */
-#ifdef CONFIG_PREEMPT_BKL
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
-#endif
 		local_irq_enable();
 		schedule();
 		local_irq_disable();
-#ifdef CONFIG_PREEMPT_BKL
 		task->lock_depth = saved_lock_depth;
-#endif
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -5241,11 +5231,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
-	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
 	task_thread_info(idle)->preempt_count = 0;
-#endif
+
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
-- 
cgit v1.1


From 1ad82fd547c716f96e544b477e0bdbfa2d647529 Mon Sep 17 00:00:00 2001
From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: debug: clean up kernel/profile.c

 Before:
 total: 25 errors, 13 warnings, 602 lines checked

 After:
 total: 0 errors, 2 warnings, 601 lines checked

No code changed:

kernel/profile.o:
   text    data     bss     dec     hex filename
   3048     236      24    3308     cec profile.o.before
   3048     236      24    3308     cec profile.o.after
 md5:
   2501d64748a4d350dffb11203e2a5182  profile.o.before.asm
   2501d64748a4d350dffb11203e2a5182  profile.o.after.asm

Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/profile.c | 99 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 5e95330..e64c2da 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
 static DEFINE_MUTEX(profile_flip_mutex);
 #endif /* CONFIG_SMP */
 
-static int __init profile_setup(char * str)
+static int __init profile_setup(char *str)
 {
 	static char __initdata schedstr[] = "schedule";
 	static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
 
 void __init profile_init(void)
 {
-	if (!prof_on) 
+	if (!prof_on)
 		return;
- 
+
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
 	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
 }
 
 /* Profile event notifications */
- 
+
 #ifdef CONFIG_PROFILING
- 
+
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
- 
-void profile_task_exit(struct task_struct * task)
+
+void profile_task_exit(struct task_struct *task)
 {
 	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 }
- 
-int profile_handoff_task(struct task_struct * task)
+
+int profile_handoff_task(struct task_struct *task)
 {
 	int ret;
 	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
 	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 }
 
-int task_handoff_register(struct notifier_block * n)
+int task_handoff_register(struct notifier_block *n)
 {
 	return atomic_notifier_chain_register(&task_free_notifier, n);
 }
+EXPORT_SYMBOL_GPL(task_handoff_register);
 
-int task_handoff_unregister(struct notifier_block * n)
+int task_handoff_unregister(struct notifier_block *n)
 {
 	return atomic_notifier_chain_unregister(&task_free_notifier, n);
 }
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
 
-int profile_event_register(enum profile_type type, struct notifier_block * n)
+int profile_event_register(enum profile_type type, struct notifier_block *n)
 {
 	int err = -EINVAL;
- 
+
 	switch (type) {
-		case PROFILE_TASK_EXIT:
-			err = blocking_notifier_chain_register(
-					&task_exit_notifier, n);
-			break;
-		case PROFILE_MUNMAP:
-			err = blocking_notifier_chain_register(
-					&munmap_notifier, n);
-			break;
+	case PROFILE_TASK_EXIT:
+		err = blocking_notifier_chain_register(
+				&task_exit_notifier, n);
+		break;
+	case PROFILE_MUNMAP:
+		err = blocking_notifier_chain_register(
+				&munmap_notifier, n);
+		break;
 	}
- 
+
 	return err;
 }
+EXPORT_SYMBOL_GPL(profile_event_register);
 
- 
-int profile_event_unregister(enum profile_type type, struct notifier_block * n)
+int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 {
 	int err = -EINVAL;
- 
+
 	switch (type) {
-		case PROFILE_TASK_EXIT:
-			err = blocking_notifier_chain_unregister(
-					&task_exit_notifier, n);
-			break;
-		case PROFILE_MUNMAP:
-			err = blocking_notifier_chain_unregister(
-					&munmap_notifier, n);
-			break;
+	case PROFILE_TASK_EXIT:
+		err = blocking_notifier_chain_unregister(
+				&task_exit_notifier, n);
+		break;
+	case PROFILE_MUNMAP:
+		err = blocking_notifier_chain_unregister(
+				&munmap_notifier, n);
+		break;
 	}
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(profile_event_unregister);
 
 int register_timer_hook(int (*hook)(struct pt_regs *))
 {
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
 	timer_hook = hook;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(register_timer_hook);
 
 void unregister_timer_hook(int (*hook)(struct pt_regs *))
 {
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
 	/* make sure all CPUs see the NULL hook */
 	synchronize_sched();  /* Allow ongoing interrupts to complete. */
 }
-
-EXPORT_SYMBOL_GPL(register_timer_hook);
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
-EXPORT_SYMBOL_GPL(task_handoff_register);
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-EXPORT_SYMBOL_GPL(profile_event_register);
-EXPORT_SYMBOL_GPL(profile_event_unregister);
 
 #endif /* CONFIG_PROFILING */
 
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
 		}
 		break;
-	out_free:
+out_free:
 		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
 		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
 		__free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
 	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 #endif /* !CONFIG_SMP */
-
 EXPORT_SYMBOL_GPL(profile_hits);
 
 void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
 
-static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
+static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {
 	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
 	return len;
 }
 
-static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
-					unsigned long count, void *data)
+static int prof_cpu_mask_write_proc(struct file *file,
+	const char __user *buffer,  unsigned long count, void *data)
 {
 	cpumask_t *mask = (cpumask_t *)data;
 	unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
 	struct proc_dir_entry *entry;
 
 	/* create /proc/irq/prof_cpu_mask */
-	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
+	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+	if (!entry)
 		return;
 	entry->data = (void *)&prof_cpu_mask;
 	entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	unsigned long p = *ppos;
 	ssize_t read;
-	char * pnt;
+	char *pnt;
 	unsigned int sample_step = 1 << prof_shift;
 
 	profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	read = 0;
 
 	while (p < sizeof(unsigned int) && count > 0) {
-		if (put_user(*((char *)(&sample_step)+p),buf))
+		if (put_user(*((char *)(&sample_step)+p), buf))
 			return -EFAULT;
 		buf++; p++; count--; read++;
 	}
 	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
-	if (copy_to_user(buf,(void *)pnt,count))
+	if (copy_to_user(buf, (void *)pnt, count))
 		return -EFAULT;
 	read += count;
 	*ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
 #ifdef CONFIG_SMP
-	extern int setup_profiling_timer (unsigned int multiplier);
+	extern int setup_profiling_timer(unsigned int multiplier);
 
 	if (count == sizeof(int)) {
 		unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
 		return 0;
 	if (create_hash_tables())
 		return -1;
-	if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
+	entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+	if (!entry)
 		return 0;
 	entry->proc_fops = &proc_profile_operations;
 	entry->size = (1+prof_len) * sizeof(atomic_t);
-- 
cgit v1.1


From 4f05b98d54b140ed3c5851d5d5156e9918c6305d Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: sched: fix, always create kernel threads with normal priority

Ensure that the kernel threads are created with the usual nice level
and affinity even if kthreadd's properties were changed from the
default by root.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kthread.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index dcfe724..0ac8878 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,8 @@
 #include <linux/mutex.h>
 #include <asm/semaphore.h>
 
+#define KTHREAD_NICE_LEVEL (-5)
+
 static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
 	if (pid < 0) {
 		create->result = ERR_PTR(pid);
 	} else {
+		struct sched_param param = { .sched_priority = 0 };
 		wait_for_completion(&create->started);
 		read_lock(&tasklist_lock);
 		create->result = find_task_by_pid(pid);
 		read_unlock(&tasklist_lock);
+		/*
+		 * root may have changed our (kthreadd's) priority or CPU mask.
+		 * The kernel thread should not inherit these properties.
+		 */
+		sched_setscheduler(create->result, SCHED_NORMAL, &param);
+		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
+		set_cpus_allowed(create->result, CPU_MASK_ALL);
 	}
 	complete(&create->done);
 }
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_user_nice(tsk, -5);
+	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 
 	current->flags |= PF_NOFREEZE;
-- 
cgit v1.1


From 782daeee3d596282bfee4cd9e976c86be0e194a8 Mon Sep 17 00:00:00 2001
From: Guillaume Chazarain <guichaz@yahoo.fr>
Date: Fri, 25 Jan 2008 21:08:33 +0100
Subject: sched: fix rq->clock warps on frequency changes

sched: fix rq->clock warps on frequency changes

Fix 2bacec8c318ca0418c0ee9ac662ee44207765dd4
(sched: touch softlockup watchdog after idling) that reintroduced warps
on frequency changes. touch_softlockup_watchdog() calls __update_rq_clock
that checks rq->clock for warps, so call it after adjusting rq->clock.

Signed-off-by: Guillaume Chazarain <guichaz@yahoo.fr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 629614a..3995d16 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -858,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	struct rq *rq = cpu_rq(smp_processor_id());
 	u64 now = sched_clock();
 
-	touch_softlockup_watchdog();
 	rq->idle_clock += delta_ns;
 	/*
 	 * Override the previous timestamp and ignore all
@@ -870,6 +869,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	rq->prev_clock_raw = now;
 	rq->clock += delta_ns;
 	spin_unlock(&rq->lock);
+	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-- 
cgit v1.1


From cc203d2422004498909c4886d1b94a2e388d973e Mon Sep 17 00:00:00 2001
From: Guillaume Chazarain <guichaz@yahoo.fr>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: monitor clock underflows in /proc/sched_debug

We monitor clock overflows, let's also monitor clock underflows.

Signed-off-by: Guillaume Chazarain <guichaz@yahoo.fr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 6 ++++--
 kernel/sched_debug.c | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3995d16..4d3a5a7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -465,7 +465,7 @@ struct rq {
 	u64 clock, prev_clock_raw;
 	s64 clock_max_delta;
 
-	unsigned int clock_warps, clock_overflows;
+	unsigned int clock_warps, clock_overflows, clock_underflows;
 	u64 idle_clock;
 	unsigned int clock_deep_idle_events;
 	u64 tick_timestamp;
@@ -3736,8 +3736,10 @@ void scheduler_tick(void)
 	/*
 	 * Let rq->clock advance by at least TICK_NSEC:
 	 */
-	if (unlikely(rq->clock < next_tick))
+	if (unlikely(rq->clock < next_tick)) {
 		rq->clock = next_tick;
+		rq->clock_underflows++;
+	}
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 80fbbfc..9e5de09 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
 	PN(prev_clock_raw);
 	P(clock_warps);
 	P(clock_overflows);
+	P(clock_underflows);
 	P(clock_deep_idle_events);
 	PN(clock_max_delta);
 	P(cpu_load[0]);
-- 
cgit v1.1


From 326587b840785c60f5dc18557235a23bafefd620 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: fix goto retry in pick_next_task_rt()

looking at it one more time:

(1) it looks to me that there is no need to call
sched_rt_ratio_exceeded() from pick_next_rt_entity()

- [ for CONFIG_FAIR_GROUP_SCHED ] queues with rt_rq->rt_throttled are
not within this 'tree-like hierarchy' (or whatever we should call it
:-)

- there is also no need to re-check 'rt_rq->rt_time > ratio' at this
point as 'rt_rq->rt_time' couldn't have been increased since the last
call to update_curr_rt() (which obviously calls
sched_rt_ratio_esceeded())
well, it might be that 'ratio' for this rt_rq has been re-configured
(and the period over which this rt_rq was active has not yet been
finished)... but I don't think we should really take this into
account.

(2) now pick_next_rt_entity() must never return NULL, so let's change
pick_next_task_rt() accordingly.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2dac5eb..274b40d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -476,15 +476,12 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	struct list_head *queue;
 	int idx;
 
-	if (sched_rt_ratio_exceeded(rt_rq))
-		goto out;
-
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct sched_rt_entity, run_list);
- out:
+
 	return next;
 }
 
@@ -494,7 +491,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	struct task_struct *p;
 	struct rt_rq *rt_rq;
 
- retry:
 	rt_rq = &rq->rt;
 
 	if (unlikely(!rt_rq->rt_nr_running))
@@ -505,8 +501,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
-		if (unlikely(!rt_se))
-			goto retry;
+		BUG_ON(!rt_se);
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
 
-- 
cgit v1.1


From 9745512ce79de686df354dc70a8d1a74d801892d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: latencytop support

LatencyTOP kernel infrastructure; it measures latencies in the
scheduler and tracks it system wide and per process.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile     |   1 +
 kernel/fork.c       |   1 +
 kernel/latencytop.c | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_fair.c |   8 +-
 kernel/sysctl.c     |  10 +++
 5 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 kernel/latencytop.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 68755cd..390d421 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c969f4..39d22b3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1205,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
+	clear_all_latency_tracing(p);
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 0000000..b4e3c85
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+static DEFINE_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+int latencytop_enabled;
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (!latencytop_enabled)
+		return;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&p->latency_record, 0, sizeof(p->latency_record));
+	p->latency_record_count = 0;
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&latency_lock, flags);
+	memset(&latency_record, 0, sizeof(latency_record));
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+{
+	int firstnonnull = MAXLR + 1;
+	int i;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* skip kernel threads for now */
+	if (!tsk->mm)
+		return;
+
+	for (i = 0; i < MAXLR; i++) {
+		int q;
+		int same = 1;
+		/* Nothing stored: */
+		if (!latency_record[i].backtrace[0]) {
+			if (firstnonnull > i)
+				firstnonnull = i;
+			continue;
+		}
+		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+			if (latency_record[i].backtrace[q] !=
+				lat->backtrace[q])
+				same = 0;
+			if (same && lat->backtrace[q] == 0)
+				break;
+			if (same && lat->backtrace[q] == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			latency_record[i].count++;
+			latency_record[i].time += lat->time;
+			if (lat->time > latency_record[i].max)
+				latency_record[i].max = lat->time;
+			return;
+		}
+	}
+
+	i = firstnonnull;
+	if (i >= MAXLR - 1)
+		return;
+
+	/* Allocted a new one: */
+	memcpy(&latency_record[i], lat, sizeof(struct latency_record));
+}
+
+static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+{
+	struct stack_trace trace;
+
+	memset(&trace, 0, sizeof(trace));
+	trace.max_entries = LT_BACKTRACEDEPTH;
+	trace.entries = &lat->backtrace[0];
+	trace.skip = 0;
+	save_stack_trace_tsk(tsk, &trace);
+}
+
+void __sched
+account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+	unsigned long flags;
+	int i, q;
+	struct latency_record lat;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* Long interruptible waits are generally user requested... */
+	if (inter && usecs > 5000)
+		return;
+
+	memset(&lat, 0, sizeof(lat));
+	lat.count = 1;
+	lat.time = usecs;
+	lat.max = usecs;
+	store_stacktrace(tsk, &lat);
+
+	spin_lock_irqsave(&latency_lock, flags);
+
+	account_global_scheduler_latency(tsk, &lat);
+
+	/*
+	 * short term hack; if we're > 32 we stop; future we recycle:
+	 */
+	tsk->latency_record_count++;
+	if (tsk->latency_record_count >= LT_SAVECOUNT)
+		goto out_unlock;
+
+	for (i = 0; i < LT_SAVECOUNT ; i++) {
+		struct latency_record *mylat;
+		int same = 1;
+		mylat = &tsk->latency_record[i];
+		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+			if (mylat->backtrace[q] !=
+				lat.backtrace[q])
+				same = 0;
+			if (same && lat.backtrace[q] == 0)
+				break;
+			if (same && lat.backtrace[q] == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			mylat->count++;
+			mylat->time += lat.time;
+			if (lat.time > mylat->max)
+				mylat->max = lat.time;
+			goto out_unlock;
+		}
+	}
+
+	/* Allocated a new one: */
+	i = tsk->latency_record_count;
+	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < MAXLR; i++) {
+		if (latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				latency_record[i].count,
+				latency_record[i].time,
+				latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!latency_record[i].backtrace[q])
+					break;
+				if (latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+	}
+	return 0;
+}
+
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+	     loff_t *offs)
+{
+	clear_global_latency_tracing();
+
+	return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, lstats_show, NULL);
+}
+
+static struct file_operations lstats_fops = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+	struct proc_dir_entry *pe;
+
+	pe = create_proc_entry("latency_stats", 0644, NULL);
+	if (!pe)
+		return -ENOMEM;
+
+	pe->proc_fops = &lstats_fops;
+
+	return 0;
+}
+__initcall(init_lstats_procfs);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3dab1ff..1b3b40a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,6 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
+#include <linux/latencytop.h>
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #ifdef CONFIG_SCHEDSTATS
 	if (se->sleep_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
+
+		account_scheduler_latency(tsk, delta >> 10, 1);
 	}
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+		struct task_struct *tsk = task_of(se);
 
 		if ((s64)delta < 0)
 			delta = 0;
@@ -462,11 +468,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * time that the task spent sleeping:
 		 */
 		if (unlikely(prof_on == SLEEP_PROFILING)) {
-			struct task_struct *tsk = task_of(se);
 
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 				     delta >> 20);
 		}
+		account_scheduler_latency(tsk, delta >> 10, 0);
 	}
 #endif
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3afbd25..5418ef6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,7 @@ extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int audit_argv_kb;
+extern int latencytop_enabled;
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -416,6 +417,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec_taint,
 	},
 #endif
+#ifdef CONFIG_LATENCYTOP
+	{
+		.procname	= "latencytop",
+		.data		= &latencytop_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_SECURITY_CAPABILITIES
 	{
 		.procname	= "cap-bound",
-- 
cgit v1.1


From 90739081ef8d5495d50abba9c5d333be9acd872a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: softlockup: fix signedness

fix softlockup tunables signedness.

mark tunables read-mostly.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 10 +++++-----
 kernel/sysctl.c     | 16 ++++++++--------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 02f0ad5..c1d7655 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -24,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
 static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
-static int did_panic;
-int softlockup_thresh = 60;
+static int __read_mostly did_panic;
+unsigned long __read_mostly softlockup_thresh = 60;
 
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -121,14 +121,14 @@ void softlockup_tick(void)
 /*
  * Have a reasonable limit on the number of tasks checked:
  */
-unsigned long sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
 
 /*
  * Zero means infinite timeout - no checking done:
  */
-unsigned long sysctl_hung_task_timeout_secs = 120;
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
 
-long sysctl_hung_task_warnings = 10;
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
 /*
  * Only do the hung-tasks check on one CPU:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5418ef6..8e96558 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -772,9 +772,9 @@ static struct ctl_table kern_table[] = {
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
 		.data		= &softlockup_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &one,
 		.extra2		= &sixty,
@@ -783,27 +783,27 @@ static struct ctl_table kern_table[] = {
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_check_count",
 		.data		= &sysctl_hung_task_check_count,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_timeout_secs",
 		.data		= &sysctl_hung_task_timeout_secs,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_warnings",
 		.data		= &sysctl_hung_task_warnings,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 	},
 #endif
-- 
cgit v1.1


From 19ef9309273d26cb005cb23e6a370353dca91099 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: printk: use ktime_get()

printk timestamps: use ktime_get().

Some platforms have a functioning clocksource function only after
they are done with early bootup, so delay this until out of
SYSTEM_BOOTING state.

it's also inherently safe now, as any bugs in this area will be
caught by the printk recursion checks.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 3b7c968..423a8c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -702,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 					loglev_char = default_message_loglevel
 						+ '0';
 				}
-				t = cpu_clock(printk_cpu);
+				t = 0;
+				if (system_state != SYSTEM_BOOTING)
+					t = ktime_to_ns(ktime_get());
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
-- 
cgit v1.1


From 5fb5e6de55860a99c2d8fe7e0c8222d5c53d8464 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: print backtrace of running tasks too

The attached patch is something really simple that can sometimes help
in getting more info out of a hung system.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4d3a5a7..524285e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5161,8 +5161,7 @@ void sched_show_task(struct task_struct *p)
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
 
-	if (state != TASK_RUNNING)
-		show_stack(p, NULL);
+	show_stack(p, NULL);
 }
 
 void show_state_filter(unsigned long state_filter)
-- 
cgit v1.1


From 5973e5b954848c63855a357ad4ff39882e3904f7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 25 Jan 2008 21:08:34 +0100
Subject: sched: fix: don't take a mutex from interrupt context

print_cfs_stats is callable from interrupt context (sysrq), hence it should
not take mutexes. Change it to use RCU since the task group data is RCU
freed anyway.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1b3b40a..45ff4e9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1434,9 +1434,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
 #endif
-	lock_task_group_list();
+	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
-	unlock_task_group_list();
+	rcu_read_unlock();
 }
 #endif
-- 
cgit v1.1


From 6d082592b62689fb91578d0338d04a9f50991990 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jan 2008 21:08:35 +0100
Subject: sched: keep total / count stats in addition to the max for

Right now, the linux kernel (with scheduler statistics enabled) keeps track
of the maximum time a process is waiting to be scheduled. While the maximum
is a very useful metric, tracking average and total is equally useful
(at least for latencytop) to figure out the accumulated effect of scheduler
delays. The accumulated effect is important to judge the performance impact
of scheduler tuning/behavior.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 4 ++++
 kernel/sched_fair.c  | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9e5de09..4b5e24c 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -300,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_max);
 	PN(se.slice_max);
 	PN(se.wait_max);
+	PN(se.wait_sum);
+	P(se.wait_count);
 	P(sched_info.bkl_count);
 	P(se.nr_migrations);
 	P(se.nr_migrations_cold);
@@ -367,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_max				= 0;
+	p->se.wait_sum				= 0;
+	p->se.wait_count			= 0;
 	p->se.sleep_max				= 0;
 	p->se.sum_sleep_runtime			= 0;
 	p->se.block_max				= 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 45ff4e9..72e25c7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -385,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	schedstat_set(se->wait_max, max(se->wait_max,
 			rq_of(cfs_rq)->clock - se->wait_start));
+	schedstat_set(se->wait_count, se->wait_count + 1);
+	schedstat_set(se->wait_sum, se->wait_sum +
+			rq_of(cfs_rq)->clock - se->wait_start);
 	schedstat_set(se->wait_start, 0);
 }
 
-- 
cgit v1.1


From 81ef16e763bb899053e06f6050603a305456a085 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 26 Jan 2008 14:11:06 +0100
Subject: [S390] Remove appldata include from sysctl_check.c

Forgot to remove this when removing the appldata binary sysctls.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/sysctl_check.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index a68425a..d8a5558 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,5 @@
 #include <linux/stat.h>
 #include <linux/sysctl.h>
-#include "../arch/s390/appldata/appldata.h"
 #include "../fs/xfs/linux-2.6/xfs_sysctl.h"
 #include <linux/sunrpc/debug.h>
 #include <linux/string.h>
-- 
cgit v1.1


From 326e96b92306b7af24a3608ec01156cba17a3fc1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 27 Jan 2008 08:03:54 +0100
Subject: printk: revert ktime_get() timestamps

revert 19ef9309273d26cb005cb23e6a370353dca91099.

Kevin Winchester reported a lockup during X startup an bisected
it to this commit.

Reported-by: Kevin Winchester <kjwinchester@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 423a8c7..3b7c968 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -702,9 +702,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 					loglev_char = default_message_loglevel
 						+ '0';
 				}
-				t = 0;
-				if (system_state != SYSTEM_BOOTING)
-					t = ktime_to_ns(ktime_get());
+				t = cpu_clock(printk_cpu);
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
-- 
cgit v1.1


From fd0928df98b9578be8a786ac0cb78a47a5e17a20 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Jan 2008 08:52:45 +0100
Subject: ioprio: move io priority from task_struct to io_context

This is where it belongs and then it doesn't take up space for a
process that doesn't do IO.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/fork.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 39d22b3..2a86c9d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -51,6 +51,7 @@
 #include <linux/random.h>
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
+#include <linux/blkdev.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -791,6 +792,26 @@ out:
 	return error;
 }
 
+static int copy_io(struct task_struct *tsk)
+{
+#ifdef CONFIG_BLOCK
+	struct io_context *ioc = current->io_context;
+
+	if (!ioc)
+		return 0;
+
+	if (ioprio_valid(ioc->ioprio)) {
+		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+
+		tsk->io_context->task = tsk;
+		tsk->io_context->ioprio = ioc->ioprio;
+	}
+#endif
+	return 0;
+}
+
 /*
  *	Helper to unshare the files of the current task.
  *	We don't want to expose copy_files internals to
@@ -1156,15 +1177,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
+	if ((retval = copy_io(p)))
+		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-		goto bad_fork_cleanup_namespaces;
+		goto bad_fork_cleanup_io;
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
 		pid = alloc_pid(task_active_pid_ns(p));
 		if (!pid)
-			goto bad_fork_cleanup_namespaces;
+			goto bad_fork_cleanup_io;
 
 		if (clone_flags & CLONE_NEWPID) {
 			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1234,9 +1257,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
-	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
-	p->ioprio = current->ioprio;
-
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1328,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
+bad_fork_cleanup_io:
+	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_keys:
-- 
cgit v1.1


From d38ecf935fcb10264a6bc190855d9595165e6eeb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Jan 2008 08:53:35 +0100
Subject: io context sharing: preliminary support

Detach task state from ioc, instead keep track of how many processes
are accessing the ioc.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2a86c9d..1987c57 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -805,7 +805,6 @@ static int copy_io(struct task_struct *tsk)
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
 
-		tsk->io_context->task = tsk;
 		tsk->io_context->ioprio = ioc->ioprio;
 	}
 #endif
-- 
cgit v1.1


From fadad878cc0640cc9cd5569998bf54b693f7b38b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 24 Jan 2008 08:54:47 +0100
Subject: kernel: add CLONE_IO to specifically request sharing of IO contexts

syslets (or other threads/processes that want io context sharing) can
set this to enforce sharing of io context.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/fork.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1987c57..314f510 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -792,15 +792,21 @@ out:
 	return error;
 }
 
-static int copy_io(struct task_struct *tsk)
+static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
 #ifdef CONFIG_BLOCK
 	struct io_context *ioc = current->io_context;
 
 	if (!ioc)
 		return 0;
-
-	if (ioprio_valid(ioc->ioprio)) {
+	/*
+	 * Share io context with parent, if CLONE_IO is set
+	 */
+	if (clone_flags & CLONE_IO) {
+		tsk->io_context = ioc_task_link(ioc);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+	} else if (ioprio_valid(ioc->ioprio)) {
 		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
@@ -1176,7 +1182,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
 		goto bad_fork_cleanup_keys;
-	if ((retval = copy_io(p)))
+	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
-- 
cgit v1.1


From 29e796fd4de54b8f5bc30d897611210ece4fd0f2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 30 Nov 2007 23:50:18 +1100
Subject: sysctl: Add register_sysctl_paths function

There are a number of modules that register a sysctl table
somewhere deeply nested in the sysctl hierarchy, such as
fs/nfs, fs/xfs, dev/cdrom, etc.

They all specify several dummy ctl_tables for the path name.
This patch implements register_sysctl_path that takes
an additional path name, and makes up dummy sysctl nodes
for each component.

This patch was originally written by Olaf Kirch and
brought to my attention and reworked some by Olaf Hering.
I have changed a few additional things so the bugs are mine.

After converting all of the easy callers Olaf Hering observed
allyesconfig ARCH=i386, the patch reduces the final binary size by 9369 bytes.

.text +897
.data -7008

   text    data     bss     dec     hex filename
   26959310        4045899 4718592 35723801        2211a19 ../vmlinux-vanilla
   26960207        4038891 4718592 35717690        221023a ../O-allyesconfig/vmlinux

So this change is both a space savings and a code simplification.

CC: Olaf Kirch <okir@suse.de>
CC: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Daniel Lezcano <dlezcano@fr.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e96558..f5805423 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1561,11 +1561,12 @@ static __init int sysctl_init(void)
 core_initcall(sysctl_init);
 
 /**
- * register_sysctl_table - register a sysctl hierarchy
+ * register_sysctl_paths - register a sysctl hierarchy
+ * @path: The path to the directory the sysctl table is in.
  * @table: the top-level table structure
  *
  * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. An entry with a ctl_name of 0 terminates the table. 
+ * array. A completely 0 filled entry terminates the table.
  *
  * The members of the &struct ctl_table structure are used as follows:
  *
@@ -1628,25 +1629,76 @@ core_initcall(sysctl_init);
  * This routine returns %NULL on a failure to register, and a pointer
  * to the table header on success.
  */
-struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+						struct ctl_table *table)
 {
-	struct ctl_table_header *tmp;
-	tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
-	if (!tmp)
+	struct ctl_table_header *header;
+	struct ctl_table *new, **prevp;
+	unsigned int n, npath;
+
+	/* Count the path components */
+	for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
+		;
+
+	/*
+	 * For each path component, allocate a 2-element ctl_table array.
+	 * The first array element will be filled with the sysctl entry
+	 * for this, the second will be the sentinel (ctl_name == 0).
+	 *
+	 * We allocate everything in one go so that we don't have to
+	 * worry about freeing additional memory in unregister_sysctl_table.
+	 */
+	header = kzalloc(sizeof(struct ctl_table_header) +
+			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
+	if (!header)
 		return NULL;
-	tmp->ctl_table = table;
-	INIT_LIST_HEAD(&tmp->ctl_entry);
-	tmp->used = 0;
-	tmp->unregistering = NULL;
-	sysctl_set_parent(NULL, table);
-	if (sysctl_check_table(tmp->ctl_table)) {
-		kfree(tmp);
+
+	new = (struct ctl_table *) (header + 1);
+
+	/* Now connect the dots */
+	prevp = &header->ctl_table;
+	for (n = 0; n < npath; ++n, ++path) {
+		/* Copy the procname */
+		new->procname = path->procname;
+		new->ctl_name = path->ctl_name;
+		new->mode     = 0555;
+
+		*prevp = new;
+		prevp = &new->child;
+
+		new += 2;
+	}
+	*prevp = table;
+
+	INIT_LIST_HEAD(&header->ctl_entry);
+	header->used = 0;
+	header->unregistering = NULL;
+	sysctl_set_parent(NULL, header->ctl_table);
+	if (sysctl_check_table(header->ctl_table)) {
+		kfree(header);
 		return NULL;
 	}
 	spin_lock(&sysctl_lock);
-	list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+	list_add_tail(&header->ctl_entry, &root_table_header.ctl_entry);
 	spin_unlock(&sysctl_lock);
-	return tmp;
+
+	return header;
+}
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+	static const struct ctl_path null_path[] = { {} };
+
+	return register_sysctl_paths(null_path, table);
 }
 
 /**
@@ -1675,6 +1727,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 	return NULL;
 }
 
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+						    struct ctl_table *table)
+{
+	return NULL;
+}
+
 void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
@@ -2733,6 +2791,7 @@ EXPORT_SYMBOL(proc_dostring);
 EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(register_sysctl_paths);
 EXPORT_SYMBOL(sysctl_intvec);
 EXPORT_SYMBOL(sysctl_jiffies);
 EXPORT_SYMBOL(sysctl_ms_jiffies);
-- 
cgit v1.1


From 23eb06de7d2d333a0f7ebba2da663e00c9c9483e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 30 Nov 2007 23:52:10 +1100
Subject: sysctl: Remember the ctl_table we passed to register_sysctl_paths

By doing this we allow users of register_sysctl_paths that build
and dynamically allocate their ctl_table to be simpler.  This allows
them to just remember the ctl_table_header returned from
register_sysctl_paths from which they can now find the
ctl_table array they need to free.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Daniel Lezcano <dlezcano@fr.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5805423..89b7d95 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1669,6 +1669,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 		new += 2;
 	}
 	*prevp = table;
+	header->ctl_table_arg = table;
 
 	INIT_LIST_HEAD(&header->ctl_entry);
 	header->used = 0;
-- 
cgit v1.1


From e51b6ba077791f2f8c876022b37419be7a2ceec3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 30 Nov 2007 23:54:00 +1100
Subject: sysctl: Infrastructure for per namespace sysctls

This patch implements the basic infrastructure for per namespace sysctls.

A list of lists of sysctl headers is added, allowing each namespace to have
it's own list of sysctl headers.

Each list of sysctl headers has a lookup function to find the first
sysctl header in the list, allowing the lists to have a per namespace
instance.

register_sysct_root is added to tell sysctl.c about additional
lists of sysctl_headers.  As all of the users are expected to be in
kernel no unregister function is provided.

sysctl_head_next is updated to walk through the list of lists.

__register_sysctl_paths is added to add a new sysctl table on
a non-default sysctl list.

The only intrusive part of this patch is propagating the information
to decided which list of sysctls to use for sysctl_check_table.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Daniel Lezcano <dlezcano@fr.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c       | 93 +++++++++++++++++++++++++++++++++++++++++++++------
 kernel/sysctl_check.c | 25 ++++++++------
 2 files changed, 96 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 89b7d95..45e76f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -157,8 +157,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
 #endif
 
 static struct ctl_table root_table[];
-static struct ctl_table_header root_table_header =
-	{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
+static struct ctl_table_root sysctl_table_root;
+static struct ctl_table_header root_table_header = {
+	.ctl_table = root_table,
+	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+	.root = &sysctl_table_root,
+};
+static struct ctl_table_root sysctl_table_root = {
+	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+	.header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+};
 
 static struct ctl_table kern_table[];
 static struct ctl_table vm_table[];
@@ -1371,12 +1379,27 @@ void sysctl_head_finish(struct ctl_table_header *head)
 	spin_unlock(&sysctl_lock);
 }
 
-struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
+static struct list_head *
+lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
 {
+	struct list_head *header_list;
+	header_list = &root->header_list;
+	if (root->lookup)
+		header_list = root->lookup(root, namespaces);
+	return header_list;
+}
+
+struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
+					    struct ctl_table_header *prev)
+{
+	struct ctl_table_root *root;
+	struct list_head *header_list;
 	struct ctl_table_header *head;
 	struct list_head *tmp;
+
 	spin_lock(&sysctl_lock);
 	if (prev) {
+		head = prev;
 		tmp = &prev->ctl_entry;
 		unuse_table(prev);
 		goto next;
@@ -1390,14 +1413,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
 		spin_unlock(&sysctl_lock);
 		return head;
 	next:
+		root = head->root;
 		tmp = tmp->next;
-		if (tmp == &root_table_header.ctl_entry)
-			break;
+		header_list = lookup_header_list(root, namespaces);
+		if (tmp != header_list)
+			continue;
+
+		do {
+			root = list_entry(root->root_list.next,
+					struct ctl_table_root, root_list);
+			if (root == &sysctl_table_root)
+				goto out;
+			header_list = lookup_header_list(root, namespaces);
+		} while (list_empty(header_list));
+		tmp = header_list->next;
 	}
+out:
 	spin_unlock(&sysctl_lock);
 	return NULL;
 }
 
+struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
+{
+	return __sysctl_head_next(current->nsproxy, prev);
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+	spin_lock(&sysctl_lock);
+	list_add_tail(&root->root_list, &sysctl_table_root.root_list);
+	spin_unlock(&sysctl_lock);
+}
+
 #ifdef CONFIG_SYSCTL_SYSCALL
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
 	       void __user *newval, size_t newlen)
@@ -1554,14 +1601,16 @@ static __init int sysctl_init(void)
 {
 	int err;
 	sysctl_set_parent(NULL, root_table);
-	err = sysctl_check_table(root_table);
+	err = sysctl_check_table(current->nsproxy, root_table);
 	return 0;
 }
 
 core_initcall(sysctl_init);
 
 /**
- * register_sysctl_paths - register a sysctl hierarchy
+ * __register_sysctl_paths - register a sysctl hierarchy
+ * @root: List of sysctl headers to register on
+ * @namespaces: Data to compute which lists of sysctl entries are visible
  * @path: The path to the directory the sysctl table is in.
  * @table: the top-level table structure
  *
@@ -1629,9 +1678,12 @@ core_initcall(sysctl_init);
  * This routine returns %NULL on a failure to register, and a pointer
  * to the table header on success.
  */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
-						struct ctl_table *table)
+struct ctl_table_header *__register_sysctl_paths(
+	struct ctl_table_root *root,
+	struct nsproxy *namespaces,
+	const struct ctl_path *path, struct ctl_table *table)
 {
+	struct list_head *header_list;
 	struct ctl_table_header *header;
 	struct ctl_table *new, **prevp;
 	unsigned int n, npath;
@@ -1674,19 +1726,38 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 	INIT_LIST_HEAD(&header->ctl_entry);
 	header->used = 0;
 	header->unregistering = NULL;
+	header->root = root;
 	sysctl_set_parent(NULL, header->ctl_table);
-	if (sysctl_check_table(header->ctl_table)) {
+	if (sysctl_check_table(namespaces, header->ctl_table)) {
 		kfree(header);
 		return NULL;
 	}
 	spin_lock(&sysctl_lock);
-	list_add_tail(&header->ctl_entry, &root_table_header.ctl_entry);
+	header_list = lookup_header_list(root, namespaces);
+	list_add_tail(&header->ctl_entry, header_list);
 	spin_unlock(&sysctl_lock);
 
 	return header;
 }
 
 /**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+						struct ctl_table *table)
+{
+	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
+					path, table);
+}
+
+/**
  * register_sysctl_table - register a sysctl table hierarchy
  * @table: the top-level table structure
  *
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index d8a5558..c3206fa 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1342,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table)
 	}
 }
 
-static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
+static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
+						struct ctl_table *table)
 {
 	struct ctl_table_header *head;
 	struct ctl_table *ref, *test;
@@ -1350,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
 
 	depth = sysctl_depth(table);
 
-	for (head = sysctl_head_next(NULL); head;
-	     head = sysctl_head_next(head)) {
+	for (head = __sysctl_head_next(namespaces, NULL); head;
+	     head = __sysctl_head_next(namespaces, head)) {
 		cur_depth = depth;
 		ref = head->ctl_table;
 repeat:
@@ -1396,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
 	*fail = str;
 }
 
-static int sysctl_check_dir(struct ctl_table *table)
+static int sysctl_check_dir(struct nsproxy *namespaces,
+				struct ctl_table *table)
 {
 	struct ctl_table *ref;
 	int error;
 
 	error = 0;
-	ref = sysctl_check_lookup(table);
+	ref = sysctl_check_lookup(namespaces, table);
 	if (ref) {
 		int match = 0;
 		if ((!table->procname && !ref->procname) ||
@@ -1427,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table)
 	return error;
 }
 
-static void sysctl_check_leaf(struct ctl_table *table, const char **fail)
+static void sysctl_check_leaf(struct nsproxy *namespaces,
+				struct ctl_table *table, const char **fail)
 {
 	struct ctl_table *ref;
 
-	ref = sysctl_check_lookup(table);
+	ref = sysctl_check_lookup(namespaces, table);
 	if (ref && (ref != table))
 		set_fail(fail, table, "Sysctl already exists");
 }
@@ -1455,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
 	}
 }
 
-int sysctl_check_table(struct ctl_table *table)
+int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 {
 	int error = 0;
 	for (; table->ctl_name || table->procname; table++) {
@@ -1485,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table)
 				set_fail(&fail, table, "Directory with extra1");
 			if (table->extra2)
 				set_fail(&fail, table, "Directory with extra2");
-			if (sysctl_check_dir(table))
+			if (sysctl_check_dir(namespaces, table))
 				set_fail(&fail, table, "Inconsistent directory names");
 		} else {
 			if ((table->strategy == sysctl_data) ||
@@ -1534,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table)
 			if (!table->procname && table->proc_handler)
 				set_fail(&fail, table, "proc_handler without procname");
 #endif
-			sysctl_check_leaf(table, &fail);
+			sysctl_check_leaf(namespaces, table, &fail);
 		}
 		sysctl_check_bin_path(table, &fail);
 		if (fail) {
@@ -1542,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table)
 			error = -EINVAL;
 		}
 		if (table->child)
-			error |= sysctl_check_table(table->child);
+			error |= sysctl_check_table(namespaces, table->child);
 	}
 	return error;
 }
-- 
cgit v1.1


From 08913681e484f3f0db949dd0809012e089846216 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 5 Dec 2007 01:42:49 -0800
Subject: [NET]: Remove the empty net_table

I have removed all the entries from this table (core_table,
ipv4_table and tr_table), so now we can safely drop it.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45e76f2..4bc8e48 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -200,14 +200,6 @@ static struct ctl_table root_table[] = {
 		.mode		= 0555,
 		.child		= vm_table,
 	},
-#ifdef CONFIG_NET
-	{
-		.ctl_name	= CTL_NET,
-		.procname	= "net",
-		.mode		= 0555,
-		.child		= net_table,
-	},
-#endif
 	{
 		.ctl_name	= CTL_FS,
 		.procname	= "fs",
-- 
cgit v1.1


From a2da4052f1df6bc77749f84496fe731ab8b458f7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Jan 2008 17:13:17 -0500
Subject: module: Don't report discarded init pages as kernel text.

Current code could cause a bug in symbol_put_addr() if an arch used
kmalloc module text: we might think the symbol belongs to the core
kernel.

The downside is that this might make backtraces through (discarded)
init functions harder to read on some archs, but we already have that
issue for modules and noone has complained.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/extable.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 7fe2628..a26cb2e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr)
 	    addr <= (unsigned long)_etext)
 		return 1;
 
-	if (addr >= (unsigned long)_sinittext &&
+	if (system_state == SYSTEM_BOOTING &&
+	    addr >= (unsigned long)_sinittext &&
 	    addr <= (unsigned long)_einittext)
 		return 1;
 	return 0;
-- 
cgit v1.1


From c9a3ba55bb5da03fc7d707709a7fe078fe1aa0a0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Jan 2008 17:13:18 -0500
Subject: module: wait for dependent modules doing init.

There have been reports of modules failing to load because the modules
they depend on are still loading.  This changes the modules to wait
for a reasonable length of time in that case.  We time out eventually,
because there can be module loops or broken modules.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1bb4c5e..1731469 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -65,6 +65,9 @@
 static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
 int register_module_notifier(struct notifier_block * nb)
@@ -84,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier);
 static inline int strong_try_module_get(struct module *mod)
 {
 	if (mod && mod->state == MODULE_STATE_COMING)
+		return -EBUSY;
+	if (try_module_get(mod))
 		return 0;
-	return try_module_get(mod);
+	else
+		return -ENOENT;
 }
 
 static inline void add_taint_module(struct module *mod, unsigned flag)
@@ -539,11 +545,21 @@ static int already_uses(struct module *a, struct module *b)
 static int use_module(struct module *a, struct module *b)
 {
 	struct module_use *use;
-	int no_warn;
+	int no_warn, err;
 
 	if (b == NULL || already_uses(a, b)) return 1;
 
-	if (!strong_try_module_get(b))
+	/* If we're interrupted or time out, we fail. */
+	if (wait_event_interruptible_timeout(
+		    module_wq, (err = strong_try_module_get(b)) != -EBUSY,
+		    30 * HZ) <= 0) {
+		printk("%s: gave up waiting for init of module %s.\n",
+		       a->name, b->name);
+		return 0;
+	}
+
+	/* If strong_try_module_get() returned a different error, we fail. */
+	if (err)
 		return 0;
 
 	DEBUGP("Allocating new usage for %s.\n", a->name);
@@ -816,7 +832,7 @@ static inline void module_unload_free(struct module *mod)
 
 static inline int use_module(struct module *a, struct module *b)
 {
-	return strong_try_module_get(b);
+	return strong_try_module_get(b) == 0;
 }
 
 static inline void module_unload_init(struct module *mod)
@@ -1326,7 +1342,7 @@ void *__symbol_get(const char *symbol)
 
 	preempt_disable();
 	value = __find_symbol(symbol, &owner, &crc, 1);
-	if (value && !strong_try_module_get(owner))
+	if (value && strong_try_module_get(owner) != 0)
 		value = 0;
 	preempt_enable();
 
@@ -2132,6 +2148,7 @@ sys_init_module(void __user *umod,
 		mutex_lock(&module_mutex);
 		free_module(mod);
 		mutex_unlock(&module_mutex);
+		wake_up(&module_wq);
 		return ret;
 	}
 
@@ -2146,6 +2163,7 @@ sys_init_module(void __user *umod,
 	mod->init_size = 0;
 	mod->init_text_size = 0;
 	mutex_unlock(&module_mutex);
+	wake_up(&module_wq);
 
 	return 0;
 }
-- 
cgit v1.1


From efa5345e39d01deef349c120f55ac6b6eabe7457 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Jan 2008 17:13:20 -0500
Subject: module: Fix gratuitous sprintf in module.c

Andrew sent an older version of this patch: we shouldn't use sprintf
to copy a string.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1731469..276abd7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -738,7 +738,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		mutex_lock(&module_mutex);
 	}
 	/* Store the name of the last unloaded module for diagnostic purposes */
-	sprintf(last_unloaded_module, mod->name);
+	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
 	free_module(mod);
 
  out:
-- 
cgit v1.1


From bb9d3d56e792d2619cc0903df4ac01d86ac1261d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Jan 2008 17:13:21 -0500
Subject: module: better OOPS and lockdep coverage for loading modules

If we put the module in the linked list *before* calling into to, we
get the module name and functions in the OOPS (is_module_address can
find the module).  It also helps lockdep in a similar way.

Acked-and-tested-by: Joern Engel <joern@lazybastard.org>
Tested-by: Erez Zadok <ezk@cs.sunysb.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 276abd7..12067ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1294,6 +1294,17 @@ static void mod_kobject_remove(struct module *mod)
 }
 
 /*
+ * link the module with the whole machine is stopped with interrupts off
+ * - this defends against kallsyms not taking locks
+ */
+static int __link_module(void *_mod)
+{
+	struct module *mod = _mod;
+	list_add(&mod->list, &modules);
+	return 0;
+}
+
+/*
  * unlink the module with the whole machine is stopped with interrupts off
  * - this defends against kallsyms not taking locks
  */
@@ -2035,6 +2046,11 @@ static struct module *load_module(void __user *umod,
 		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
 		       mod->name);
 
+	/* Now sew it into the lists so we can get lockdep and oops
+         * info during argument parsing.  Noone should access us, since
+         * strong_try_module_get() will fail. */
+	stop_machine_run(__link_module, mod, NR_CPUS);
+
 	/* Size of section 0 is 0, so this works well if no params */
 	err = parse_args(mod->name, mod->args,
 			 (struct kernel_param *)
@@ -2043,7 +2059,7 @@ static struct module *load_module(void __user *umod,
 			 / sizeof(struct kernel_param),
 			 NULL);
 	if (err < 0)
-		goto arch_cleanup;
+		goto unlink;
 
 	err = mod_sysfs_setup(mod,
 			      (struct kernel_param *)
@@ -2051,7 +2067,7 @@ static struct module *load_module(void __user *umod,
 			      sechdrs[setupindex].sh_size
 			      / sizeof(struct kernel_param));
 	if (err < 0)
-		goto arch_cleanup;
+		goto unlink;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 	add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
 
@@ -2066,7 +2082,8 @@ static struct module *load_module(void __user *umod,
 	/* Done! */
 	return mod;
 
- arch_cleanup:
+ unlink:
+	stop_machine_run(__unlink_module, mod, NR_CPUS);
 	module_arch_cleanup(mod);
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
@@ -2091,17 +2108,6 @@ static struct module *load_module(void __user *umod,
 	goto free_hdr;
 }
 
-/*
- * link the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __link_module(void *_mod)
-{
-	struct module *mod = _mod;
-	list_add(&mod->list, &modules);
-	return 0;
-}
-
 /* This is where the real work happens */
 asmlinkage long
 sys_init_module(void __user *umod,
@@ -2126,10 +2132,6 @@ sys_init_module(void __user *umod,
 		return PTR_ERR(mod);
 	}
 
-	/* Now sew it into the lists.  They won't access us, since
-           strong_try_module_get() will fail. */
-	stop_machine_run(__link_module, mod, NR_CPUS);
-
 	/* Drop lock so they can recurse */
 	mutex_unlock(&module_mutex);
 
-- 
cgit v1.1


From 6dd06c9fbe025f542bce4cdb91790c0f91962722 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 29 Jan 2008 17:13:22 -0500
Subject: module: make module_address_lookup safe

module_address_lookup releases preemption then returns a pointer into
the module space.  The only user (kallsyms) copies the result, so just
do that under the preempt disable.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/kallsyms.c | 11 ++++-------
 kernel/module.c   | 22 +++++++++++++---------
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2fc2581..7dadc71 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr,
 int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
 				unsigned long *offset)
 {
+	char namebuf[KSYM_NAME_LEN];
 	if (is_ksym_addr(addr))
 		return !!get_symbol_pos(addr, symbolsize, offset);
 
-	return !!module_address_lookup(addr, symbolsize, offset, NULL);
+	return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
 }
 
 /*
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *offset,
 			    char **modname, char *namebuf)
 {
-	const char *msym;
-
 	namebuf[KSYM_NAME_LEN - 1] = 0;
 	namebuf[0] = 0;
 
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr,
 	}
 
 	/* see if it's in a module */
-	msym = module_address_lookup(addr, symbolsize, offset, modname);
-	if (msym)
-		return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
-
+	return module_address_lookup(addr, symbolsize, offset, modname,
+				     namebuf);
 	return NULL;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index 12067ff..e814cd7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2230,14 +2230,13 @@ static const char *get_ksymbol(struct module *mod,
 	return mod->strtab + mod->symtab[best].st_name;
 }
 
-/* For kallsyms to ask for address resolution.  NULL means not found.
-   We don't lock, as this is used for oops resolution and races are a
-   lesser concern. */
-/* FIXME: Risky: returns a pointer into a module w/o lock */
-const char *module_address_lookup(unsigned long addr,
-				  unsigned long *size,
-				  unsigned long *offset,
-				  char **modname)
+/* For kallsyms to ask for address resolution.  NULL means not found.  Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption. */
+char *module_address_lookup(unsigned long addr,
+			    unsigned long *size,
+			    unsigned long *offset,
+			    char **modname,
+			    char *namebuf)
 {
 	struct module *mod;
 	const char *ret = NULL;
@@ -2252,8 +2251,13 @@ const char *module_address_lookup(unsigned long addr,
 			break;
 		}
 	}
+	/* Make a copy in here where it's safe */
+	if (ret) {
+		strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+		ret = namebuf;
+	}
 	preempt_enable();
-	return ret;
+	return (char *)ret;
 }
 
 int lookup_module_symbol_name(unsigned long addr, char *symname)
-- 
cgit v1.1


From 8686c99875f3849047660a5b6d02438443f22ce7 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Mon, 21 Jan 2008 17:08:25 +0800
Subject: module: fix the module name length in param_sysfs_builtin

the original code use KOBJ_NAME_LEN for built-in module name length,
that's defined to 20 in linux/kobject.h, but this is not enough appearntly,
many module names are longer than this;
 #define KOBJ_NAME_LEN                   20

another macro is MODULE_NAME_LEN defined in linux/module.h, I think this is
enough for module names:
 #define MODULE_NAME_LEN (64 - sizeof(unsigned long))

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/params.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 67f65ee..42fe5e6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp)
 
 extern struct kernel_param __start___param[], __stop___param[];
 
-#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
-
 struct param_attribute
 {
 	struct module_attribute mattr;
@@ -587,7 +585,7 @@ static void __init param_sysfs_builtin(void)
 {
 	struct kernel_param *kp, *kp_begin = NULL;
 	unsigned int i, name_len, count = 0;
-	char modname[MAX_KBUILD_MODNAME + 1] = "";
+	char modname[MODULE_NAME_LEN + 1] = "";
 
 	for (i=0; i < __stop___param - __start___param; i++) {
 		char *dot;
@@ -595,12 +593,12 @@ static void __init param_sysfs_builtin(void)
 
 		kp = &__start___param[i];
 		max_name_len =
-			min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name));
+			min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
 
 		dot = memchr(kp->name, '.', max_name_len);
 		if (!dot) {
 			DEBUGP("couldn't find period in first %d characters "
-			       "of %s\n", MAX_KBUILD_MODNAME, kp->name);
+			       "of %s\n", MODULE_NAME_LEN, kp->name);
 			continue;
 		}
 		name_len = dot - kp->name;
-- 
cgit v1.1


From 0aa5bd52d0c49ca56d24584c646e6544ccbb3dc9 Mon Sep 17 00:00:00 2001
From: Jon Masters <jonathan@jonmasters.org>
Date: Mon, 21 Jan 2008 20:43:41 +0000
Subject: module: add module taint on ndiswrapper

The struct module taints member is supposed to store per-module taint
data. The kernel knows about certain specific external modules that will
taint the kernel, such as ndiswrapper. Use of ndiswrapper possibly
should set the per-module taint in addition to the global kernel
taint flag, unless we're arguing not because wrapper module itself
is not what actually causes the kernel to be tainted as such?

Signed-off-by: Jon Masters <jcm@jonmasters.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e814cd7..af3f81a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1916,7 +1916,7 @@ static struct module *load_module(void __user *umod,
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
 	if (strcmp(mod->name, "ndiswrapper") == 0)
-		add_taint(TAINT_PROPRIETARY_MODULE);
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 
-- 
cgit v1.1


From 6494a93d55fad586238cc1940e846c6d03e1aaf6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Sun, 27 Jan 2008 15:38:40 -0800
Subject: Module: check to see if we have a built in module with the same name

When trying to load a module with the same name as a built-in one, a
scary kobject backtrace comes up.  Prevent that from checking for this
condition and warning the user as to what exactly is going on.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index af3f81a..f6a4e72 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1230,6 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod)
 int mod_sysfs_init(struct module *mod)
 {
 	int err;
+	struct kobject *kobj;
 
 	if (!module_sysfs_initialized) {
 		printk(KERN_ERR "%s: module sysfs not initialized\n",
@@ -1237,6 +1238,15 @@ int mod_sysfs_init(struct module *mod)
 		err = -EINVAL;
 		goto out;
 	}
+
+	kobj = kset_find_obj(module_kset, mod->name);
+	if (kobj) {
+		printk(KERN_ERR "%s: module is already loaded\n", mod->name);
+		kobject_put(kobj);
+		err = -EINVAL;
+		goto out;
+	}
+
 	mod->mkobj.mod = mod;
 
 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
-- 
cgit v1.1


From a6fa8e5a6172a5a5bc06ed04f34e50b36c978127 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 30 Jan 2008 13:30:00 +0100
Subject: time: clean hungarian notation from timers

Clean up hungarian notation from timer code.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 80 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index f739dfb..aadfbc8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
-typedef struct tvec_s {
+struct tvec {
 	struct list_head vec[TVN_SIZE];
-} tvec_t;
+};
 
-typedef struct tvec_root_s {
+struct tvec_root {
 	struct list_head vec[TVR_SIZE];
-} tvec_root_t;
+};
 
-struct tvec_t_base_s {
+struct tvec_base {
 	spinlock_t lock;
 	struct timer_list *running_timer;
 	unsigned long timer_jiffies;
-	tvec_root_t tv1;
-	tvec_t tv2;
-	tvec_t tv3;
-	tvec_t tv4;
-	tvec_t tv5;
+	struct tvec_root tv1;
+	struct tvec tv2;
+	struct tvec tv3;
+	struct tvec tv4;
+	struct tvec tv5;
 } ____cacheline_aligned;
 
-typedef struct tvec_t_base_s tvec_base_t;
-
-tvec_base_t boot_tvec_bases;
+struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
 
 /*
- * Note that all tvec_bases is 2 byte aligned and lower bit of
+ * Note that all tvec_bases are 2 byte aligned and lower bit of
  * base in timer_list is guaranteed to be zero. Use the LSB for
  * the new flag to indicate whether the timer is deferrable
  */
 #define TBASE_DEFERRABLE_FLAG		(0x1)
 
 /* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
+static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
 {
 	return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
 }
 
-static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
 {
-	return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+	return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
 }
 
 static inline void timer_set_deferrable(struct timer_list *timer)
 {
-	timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
+	timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
 				       TBASE_DEFERRABLE_FLAG));
 }
 
 static inline void
-timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
+timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
 {
-	timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+	timer->base = (struct tvec_base *)((unsigned long)(new_base) |
 				      tbase_get_deferrable(timer->base));
 }
 
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
 EXPORT_SYMBOL_GPL(round_jiffies_relative);
 
 
-static inline void set_running_timer(tvec_base_t *base,
+static inline void set_running_timer(struct tvec_base *base,
 					struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
 #endif
 }
 
-static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
 	unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  */
-static tvec_base_t *lock_timer_base(struct timer_list *timer,
+static struct tvec_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
-	tvec_base_t *base;
+	struct tvec_base *base;
 
 	for (;;) {
-		tvec_base_t *prelock_base = timer->base;
+		struct tvec_base *prelock_base = timer->base;
 		base = tbase_get_base(prelock_base);
 		if (likely(base != NULL)) {
 			spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
 
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *base, *new_base;
+	struct tvec_base *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	tvec_base_t *base = per_cpu(tvec_bases, cpu);
+	struct tvec_base *base = per_cpu(tvec_bases, cpu);
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
-	tvec_base_t *base;
+	struct tvec_base *base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	tvec_base_t *base;
+	struct tvec_base *base;
 	unsigned long flags;
 	int ret = -1;
 
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(tvec_base_t *base, tvec_t *tv, int index)
+static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
 	struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(tvec_base_t *base)
+static inline void __run_timers(struct tvec_base *base)
 {
 	struct timer_list *timer;
 
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
  * is used on S/390 to stop all activity when a cpus is idle.
  * This functions needs to be called disabled.
  */
-static unsigned long __next_timer_interrupt(tvec_base_t *base)
+static unsigned long __next_timer_interrupt(struct tvec_base *base)
 {
 	unsigned long timer_jiffies = base->timer_jiffies;
 	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
 	int index, slot, array, found = 0;
 	struct timer_list *nte;
-	tvec_t *varray[4];
+	struct tvec *varray[4];
 
 	/* Look for timer events in tv1. */
 	index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
 	varray[3] = &base->tv5;
 
 	for (array = 0; array < 4; array++) {
-		tvec_t *varp = varray[array];
+		struct tvec *varp = varray[array];
 
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
  */
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
-	tvec_base_t *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __get_cpu_var(tvec_bases);
 	unsigned long expires;
 
 	spin_lock(&base->lock);
@@ -894,7 +892,7 @@ static inline void calc_load(unsigned long ticks)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = __get_cpu_var(tvec_bases);
+	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
 	hrtimer_run_pending();
 
@@ -1223,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS];
 static int __cpuinit init_timers_cpu(int cpu)
 {
 	int j;
-	tvec_base_t *base;
+	struct tvec_base *base;
 	static char __cpuinitdata tvec_base_done[NR_CPUS];
 
 	if (!tvec_base_done[cpu]) {
@@ -1278,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
 {
 	struct timer_list *timer;
 
@@ -1292,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 
 static void __cpuinit migrate_timers(int cpu)
 {
-	tvec_base_t *old_base;
-	tvec_base_t *new_base;
+	struct tvec_base *old_base;
+	struct tvec_base *new_base;
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-- 
cgit v1.1


From 4c9dc6412247abf4972080c51cd16a58c4009c19 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 30 Jan 2008 13:30:00 +0100
Subject: time: timer cleanups

Small cleanups to tick-related code. Wrong preempt count is followed
by BUG(), so it is hardly KERN_WARNING.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 4 ++--
 kernel/timer.c           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1a21b6f..d36ee2f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(void)
 			/* Check, if the timer was already in the past */
 			if (hrtimer_active(&ts->sched_timer))
 				goto out;
-		} else if(!tick_program_event(expires, 0))
+		} else if (!tick_program_event(expires, 0))
 				goto out;
 		/*
 		 * We are past the event already. So we crossed a
@@ -507,7 +507,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
  */
 #ifdef CONFIG_HIGH_RES_TIMERS
 /*
- * We rearm the timer until we get disabled by the idle code
+ * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled and timer->base->cpu_base->lock held.
  */
 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
diff --git a/kernel/timer.c b/kernel/timer.c
index aadfbc8..23f7ead 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -655,7 +655,7 @@ static inline void __run_timers(struct tvec_base *base)
 				int preempt_count = preempt_count();
 				fn(data);
 				if (preempt_count != preempt_count()) {
-					printk(KERN_WARNING "huh, entered %p "
+					printk(KERN_ERR "huh, entered %p "
 					       "with preempt_count %08x, exited"
 					       " with %08x?\n",
 					       fn, preempt_count,
-- 
cgit v1.1


From b10db7f0d2b589a7f88dc3026e150756cb437a28 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 30 Jan 2008 13:30:00 +0100
Subject: time: more timer related cleanups

I was confused by FSEC = 10^15 NSEC statement, plus small whitespace
fixes. When there's copyright, there should be GPL.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/softirq.c          | 4 +++-
 kernel/time/tick-sched.c  | 2 +-
 kernel/time/timer_stats.c | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bd89bc4..8fe1ff4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -3,7 +3,9 @@
  *
  *	Copyright (C) 1992 Linus Torvalds
  *
- * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *	Distribute under GPLv2.
+ *
+ *	Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
  */
 
 #include <linux/module.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d36ee2f..49e12f6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -9,7 +9,7 @@
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ *  Distribute under GPLv2.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c36bb7e..417da8c 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -26,7 +26,7 @@
  * the pid and cmdline from the owner process if applicable.
  *
  * Start/stop data collection:
- * # echo 1[0] >/proc/timer_stats
+ * # echo [1|0] >/proc/timer_stats
  *
  * Display the information collected so far:
  * # cat /proc/timer_stats
-- 
cgit v1.1


From 186e3cb8a465bac010ee3b020768d2fa2b505aef Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Jan 2008 13:30:01 +0100
Subject: timer: clean up tick-broadcast.c

clean up tick-broadcast.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-broadcast.c | 7 ++-----
 kernel/time/tick-internal.h  | 2 --
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5b86698..e1bd50c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 /*
  * Broadcast the event to the cpus, which are set in the mask
  */
-int tick_do_broadcast(cpumask_t mask)
+static void tick_do_broadcast(cpumask_t mask)
 {
-	int ret = 0, cpu = smp_processor_id();
+	int cpu = smp_processor_id();
 	struct tick_device *td;
 
 	/*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
 		cpu_clear(cpu, mask);
 		td = &per_cpu(tick_cpu_device, cpu);
 		td->evtdev->event_handler(td->evtdev);
-		ret = 1;
 	}
 
 	if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
 		cpu = first_cpu(mask);
 		td = &per_cpu(tick_cpu_device, cpu);
 		td->evtdev->broadcast(mask);
-		ret = 1;
 	}
-	return ret;
 }
 
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bb13f27..f13f2b7 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  * Broadcasting support
  */
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_do_broadcast(cpumask_t mask);
-
 extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
 extern int tick_check_broadcast_device(struct clock_event_device *dev);
 extern int tick_is_broadcast_device(struct clock_event_device *dev);
-- 
cgit v1.1


From efd9ac8630e89b9ee7ce64008bd7783952374f37 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Wed, 30 Jan 2008 13:30:01 +0100
Subject: time: fold __get_realtime_clock_ts() into getnstimeofday()

  - getnstimeofday() was just a wrapper around __get_realtime_clock_ts()
  - Replace calls to __get_realtime_clock_ts() by calls to getnstimeofday()
  - Fix bogus reference to get_realtime_clock_ts(), which never existed

Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ab46ae8..7768019 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
 }
 
 /**
- * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * getnstimeofday - Returns the time of day in a timespec
  * @ts:		pointer to the timespec to be set
  *
- * Returns the time of day in a timespec. Used by
- * do_gettimeofday() and get_realtime_clock_ts().
+ * Returns the time of day in a timespec.
  */
-static inline void __get_realtime_clock_ts(struct timespec *ts)
+void getnstimeofday(struct timespec *ts)
 {
 	unsigned long seq;
 	s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
 	timespec_add_ns(ts, nsecs);
 }
 
-/**
- * getnstimeofday - Returns the time of day in a timespec
- * @ts:		pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void getnstimeofday(struct timespec *ts)
-{
-	__get_realtime_clock_ts(ts);
-}
-
 EXPORT_SYMBOL(getnstimeofday);
 
 /**
  * do_gettimeofday - Returns the time of day in a timeval
  * @tv:		pointer to the timeval to be set
  *
- * NOTE: Users should be converted to using get_realtime_clock_ts()
+ * NOTE: Users should be converted to using getnstimeofday()
  */
 void do_gettimeofday(struct timeval *tv)
 {
 	struct timespec now;
 
-	__get_realtime_clock_ts(&now);
+	getnstimeofday(&now);
 	tv->tv_sec = now.tv_sec;
 	tv->tv_usec = now.tv_nsec/1000;
 }
-- 
cgit v1.1


From 1077f5a917b7c630231037826b344b2f7f5b903f Mon Sep 17 00:00:00 2001
From: Parag Warudkar <parag.warudkar@gmail.com>
Date: Wed, 30 Jan 2008 13:30:01 +0100
Subject: clocksource.c: use init_timer_deferrable for clocksource_watchdog

clocksource_watchdog can use a deferrable timer - reduces wakeups from
idle per second.

Signed-off-by: Parag Warudkar <parag.warudkar@gmail.com>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 8d6125a..cabfa19 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -175,7 +175,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 			if (watchdog)
 				del_timer(&watchdog_timer);
 			watchdog = cs;
-			init_timer(&watchdog_timer);
+			init_timer_deferrable(&watchdog_timer);
 			watchdog_timer.function = clocksource_watchdog;
 
 			/* Reset watchdog cycles */
-- 
cgit v1.1


From 1ada5cba6a0318f90e45b38557e7b5206a9cba38 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 30 Jan 2008 13:30:02 +0100
Subject: clocksource: make clocksource watchdog cycle through online CPUs

This way it checks if the clocks are synchronized between CPUs too.
This might be able to detect slowly drifting TSCs which only
go wrong over longer time.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cabfa19..edd5ef8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
 	}
 
 	if (!list_empty(&watchdog_list)) {
-		__mod_timer(&watchdog_timer,
-			    watchdog_timer.expires + WATCHDOG_INTERVAL);
+		/* Cycle through CPUs to check if the CPUs stay synchronized to
+		 * each other. */
+		int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+		if (next_cpu >= NR_CPUS)
+			next_cpu = first_cpu(cpu_online_map);
+		watchdog_timer.expires += WATCHDOG_INTERVAL;
+		add_timer_on(&watchdog_timer, next_cpu);
 	}
 	spin_unlock(&watchdog_lock);
 }
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 		if (!started && watchdog) {
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-			add_timer(&watchdog_timer);
+			add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
 		}
 	} else {
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				watchdog_last = watchdog->read();
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
-				add_timer(&watchdog_timer);
+				add_timer_on(&watchdog_timer,
+						first_cpu(cpu_online_map));
 			}
 		}
 	}
-- 
cgit v1.1


From 4713e22ce81eb8b3353e16435362eb3d0ec95640 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 30 Jan 2008 13:30:02 +0100
Subject: clocksource: add unregister function to disable unusable clocksources

On x86 the PIT might become an unusable clocksource. Add an unregister
function to provide a possibilty to remove the PIT from the list of
available clock sources.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/clocksource.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index edd5ef8..6e9259a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -337,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
 	spin_unlock_irqrestore(&clocksource_lock, flags);
 }
 
+/**
+ * clocksource_unregister - remove a registered clocksource
+ */
+void clocksource_unregister(struct clocksource *cs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&clocksource_lock, flags);
+	list_del(&cs->list);
+	if (clocksource_override == cs)
+		clocksource_override = NULL;
+	next_clocksource = select_clocksource();
+	spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
 #ifdef CONFIG_SYSFS
 /**
  * sysfs_show_current_clocksources - sysfs interface for current clocksource
-- 
cgit v1.1


From 45fe4fe19120a22f7339f5bb110447170c25fca9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jan 2008 13:30:03 +0100
Subject: x86: make clockevents more robust

detect zero event-device multiplicators - they then cause
division-by-zero crashes if a clockevent has been initialized
incorrectly.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5fb139f..3e59fce 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
 {
 	u64 clc = ((u64) latch << evt->shift);
 
+	if (unlikely(!evt->mult)) {
+		evt->mult = 1;
+		WARN_ON(1);
+	}
+
 	do_div(clc, evt->mult);
 	if (clc < 1000)
 		clc = 1000;
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
 	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+	/*
+	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+	 * on it, so fix it up and emit a warning:
+	 */
+	if (unlikely(!dev->mult)) {
+		dev->mult = 1;
+		WARN_ON(1);
+	}
 
 	spin_lock(&clockevents_lock);
 
-- 
cgit v1.1


From bbe4d18ac2e058c56adb0cd71f49d9ed3216a405 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 30 Jan 2008 13:30:03 +0100
Subject: NTP: correct inconsistent ntp interval/tick_length usage

I recently noticed on one of my boxes that when synched with an NTP
server, the drift value reported for the system was ~283ppm. While in
some cases, clock hardware can be that bad, it struck me as unusual as
the system was using the acpi_pm clocksource, which is one of the more
trustworthy and accurate clocksources on x86 hardware.

I brought up another system and let it sync to the same NTP server, and
I noticed a similar 280some ppm drift.

In looking at the code, I found that the acpi_pm's constant frequency
was being computed correctly at boot-up, however once the system was up,
even without the ntp daemon running, the clocksource's frequency was
being modified by the clocksource_adjust() function.

Digging deeper, I realized that in the code that keeps track of how much
the clocksource is skewing from the ntp desired time, we were using
different lengths to establish how long an time interval was.

The clocksource was being setup with the following interval:
	NTP_INTERVAL_LENGTH = NSEC_PER_SEC/NTP_INTERVAL_FREQ

While the ntp code was using the tick_length_base value:
	tick_length_base ~= (tick_usec * NSEC_PER_USEC * USER_HZ)
					/NTP_INTERVAL_FREQ

The subtle difference is:
	(tick_usec * NSEC_PER_USEC * USER_HZ) != NSEC_PER_SEC

This difference in calculation was causing the clocksource correction
code to apply a correction factor to the clocksource so the two
intervals were the same, however this results in the actual frequency of
the clocksource to be made incorrect. I believe this difference would
affect all clocksources, although to differing degrees depending on the
clocksource resolution.

The issue was introduced when my HZ free ntp patch landed in 2.6.21-rc1,
so my apologies for the mistake, and for not noticing it until now.

The following patch, corrects the clocksource's initialization code so
it uses the same interval length as the code in ntp.c. After applying
this patch, the drift value for the same system went from ~283ppm to
only 2.635ppm.

I believe this patch to be good, however it does affect all arches and
I've only tested on x86, so some caution is advised. I do think it would
be a likely candidate for a stable 2.6.24.x release.

Any thoughts or feedback would be appreciated.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7768019..092a236 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -186,7 +186,8 @@ static void change_clocksource(void)
 
 	clock->error = 0;
 	clock->xtime_nsec = 0;
-	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+	clocksource_calculate_interval(clock,
+		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
 
 	tick_clock_notify();
 
@@ -243,7 +244,8 @@ void __init timekeeping_init(void)
 	ntp_clear();
 
 	clock = clocksource_get_next();
-	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+	clocksource_calculate_interval(clock,
+		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
 	clock->cycle_last = clocksource_read(clock);
 
 	xtime.tv_sec = sec;
-- 
cgit v1.1


From 6378ddb592158db4b42197f1bc8666228800e379 Mon Sep 17 00:00:00 2001
From: Venki Pallipadi <venkatesh.pallipadi@intel.com>
Date: Wed, 30 Jan 2008 13:30:04 +0100
Subject: time: track accurate idle time with tick_sched.idle_sleeptime

Current idle time in kstat is based on jiffies and is coarse grained.
tick_sched.idle_sleeptime is making some attempt to keep track of idle time
in a fine grained manner.  But, it is not handling the time spent in
interrupts fully.

Make tick_sched.idle_sleeptime accurate with respect to time spent on
handling interrupts and also add tick_sched.idle_lastupdate, which keeps
track of last time when idle_sleeptime was updated.

This statistics will be crucial for cpufreq-ondemand governor, which can
shed some conservative gaurd band that is uses today while setting the
frequency.  The ondemand changes that uses the exact idle time is coming
soon.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/softirq.c         |  7 ++++-
 kernel/time/tick-sched.c | 70 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 54 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8fe1ff4..d7837d4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -280,9 +280,14 @@ asmlinkage void do_softirq(void)
  */
 void irq_enter(void)
 {
+#ifdef CONFIG_NO_HZ
+	int cpu = smp_processor_id();
+	if (idle_cpu(cpu) && !in_interrupt())
+		tick_nohz_stop_idle(cpu);
+#endif
 	__irq_enter();
 #ifdef CONFIG_NO_HZ
-	if (idle_cpu(smp_processor_id()))
+	if (idle_cpu(cpu))
 		tick_nohz_update_jiffies();
 #endif
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 49e12f6..63f24b5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
 	local_irq_restore(flags);
 }
 
+void tick_nohz_stop_idle(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	if (ts->idle_active) {
+		ktime_t now, delta;
+		now = ktime_get();
+		delta = ktime_sub(now, ts->idle_entrytime);
+		ts->idle_lastupdate = now;
+		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+		ts->idle_active = 0;
+	}
+}
+
+static ktime_t tick_nohz_start_idle(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, delta;
+
+	now = ktime_get();
+	if (ts->idle_active) {
+		delta = ktime_sub(now, ts->idle_entrytime);
+		ts->idle_lastupdate = now;
+		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+	}
+	ts->idle_entrytime = now;
+	ts->idle_active = 1;
+	return now;
+}
+
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	return ktime_to_us(ts->idle_sleeptime);
+}
+
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
  *
@@ -155,13 +193,14 @@ void tick_nohz_stop_sched_tick(void)
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
 	unsigned long rt_jiffies;
 	struct tick_sched *ts;
-	ktime_t last_update, expires, now, delta;
+	ktime_t last_update, expires, now;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	int cpu;
 
 	local_irq_save(flags);
 
 	cpu = smp_processor_id();
+	now = tick_nohz_start_idle(cpu);
 	ts = &per_cpu(tick_cpu_sched, cpu);
 
 	/*
@@ -193,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
 		}
 	}
 
-	now = ktime_get();
-	/*
-	 * When called from irq_exit we need to account the idle sleep time
-	 * correctly.
-	 */
-	if (ts->tick_stopped) {
-		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-	}
-
-	ts->idle_entrytime = now;
 	ts->idle_calls++;
-
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&xtime_lock);
@@ -337,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long ticks;
-	ktime_t now, delta;
+	ktime_t now;
 
-	if (!ts->tick_stopped)
+	local_irq_disable();
+	tick_nohz_stop_idle(cpu);
+
+	if (!ts->tick_stopped) {
+		local_irq_enable();
 		return;
+	}
 
 	/* Update jiffies first */
-	now = ktime_get();
-
-	local_irq_disable();
 	select_nohz_load_balancer(0);
+	now = ktime_get();
 	tick_do_update_jiffies64(now);
 	cpu_clear(cpu, nohz_cpu_mask);
 
-	/* Account the idle time */
-	delta = ktime_sub(now, ts->idle_entrytime);
-	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
 	 * time we slept as update_process_times does only a 1 tick
-- 
cgit v1.1


From 6e7c402590b75b6b45138792445ee0f0315a8473 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jan 2008 13:30:05 +0100
Subject: x86: various changes and cleanups to in_p/out_p delay details

various changes to the in_p/out_p delay details:

- add the io_delay=none method
- make each method selectable from the kernel config
- simplify the delay code a bit by getting rid of an indirect function call
- add the /proc/sys/kernel/io_delay_type sysctl
- change 'io_delay=standard|alternate' to io_delay=0x80 and io_delay=0xed
- make the io delay config not depend on CONFIG_DEBUG_KERNEL

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: "David P. Reed" <dpreed@reed.com>
---
 kernel/sysctl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4bc8e48..357b68b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
 #ifdef CONFIG_X86
 #include <asm/nmi.h>
 #include <asm/stacktrace.h>
+#include <asm/io.h>
 #endif
 
 static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -727,6 +728,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "io_delay_type",
+		.data		= &io_delay_type,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #if defined(CONFIG_MMU)
 	{
-- 
cgit v1.1


From 36df29d7994180cf7f0ccc5d46495855d56d2721 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:30:51 +0100
Subject: ptrace: generic resume

This makes ptrace_request handle all the ptrace requests that wake
up the traced task.  These do low-level ptrace implementation magic
that is not arch-specific and should be kept out of arch code.  The
implementations on each arch usually do the same thing.  The new
generic code makes use of the arch_has_single_step macro and generic
entry points to handle PTRACE_SINGLESTEP.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/ptrace.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c719bb9..fad5f1e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -366,6 +366,50 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
 	return error;
 }
 
+
+#ifdef PTRACE_SINGLESTEP
+#define is_singlestep(request)		((request) == PTRACE_SINGLESTEP)
+#else
+#define is_singlestep(request)		0
+#endif
+
+#ifdef PTRACE_SYSEMU
+#define is_sysemu_singlestep(request)	((request) == PTRACE_SYSEMU_SINGLESTEP)
+#else
+#define is_sysemu_singlestep(request)	0
+#endif
+
+static int ptrace_resume(struct task_struct *child, long request, long data)
+{
+	if (!valid_signal(data))
+		return -EIO;
+
+	if (request == PTRACE_SYSCALL)
+		set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+	else
+		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+#ifdef TIF_SYSCALL_EMU
+	if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
+		set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+	else
+		clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+
+	if (is_singlestep(request) || is_sysemu_singlestep(request)) {
+		if (unlikely(!arch_has_single_step()))
+			return -EIO;
+		user_enable_single_step(child);
+	}
+	else
+		user_disable_single_step(child);
+
+	child->exit_code = data;
+	wake_up_process(child);
+
+	return 0;
+}
+
 int ptrace_request(struct task_struct *child, long request,
 		   long addr, long data)
 {
@@ -390,6 +434,23 @@ int ptrace_request(struct task_struct *child, long request,
 	case PTRACE_DETACH:	 /* detach a process that was attached. */
 		ret = ptrace_detach(child, data);
 		break;
+
+#ifdef PTRACE_SINGLESTEP
+	case PTRACE_SINGLESTEP:
+#endif
+#ifdef PTRACE_SYSEMU
+	case PTRACE_SYSEMU:
+	case PTRACE_SYSEMU_SINGLESTEP:
+#endif
+	case PTRACE_SYSCALL:
+	case PTRACE_CONT:
+		return ptrace_resume(child, request, data);
+
+	case PTRACE_KILL:
+		if (child->exit_state)	/* already dead */
+			return 0;
+		return ptrace_resume(child, request, SIGKILL);
+
 	default:
 		break;
 	}
-- 
cgit v1.1


From 5b88abbf770a0e1975c668743100f42934f385e8 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:30:53 +0100
Subject: ptrace: generic PTRACE_SINGLEBLOCK

This makes ptrace_request handle PTRACE_SINGLEBLOCK along with
PTRACE_CONT et al.  The new generic code makes use of the
arch_has_block_step macro and generic entry points on machines
that define them.

[ mingo@elte.hu: bugfix ]

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/ptrace.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index fad5f1e..973d727 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -373,6 +373,12 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
 #define is_singlestep(request)		0
 #endif
 
+#ifdef PTRACE_SINGLEBLOCK
+#define is_singleblock(request)		((request) == PTRACE_SINGLEBLOCK)
+#else
+#define is_singleblock(request)		0
+#endif
+
 #ifdef PTRACE_SYSEMU
 #define is_sysemu_singlestep(request)	((request) == PTRACE_SYSEMU_SINGLESTEP)
 #else
@@ -396,7 +402,11 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
 		clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 
-	if (is_singlestep(request) || is_sysemu_singlestep(request)) {
+	if (is_singleblock(request)) {
+		if (unlikely(!arch_has_block_step()))
+			return -EIO;
+		user_enable_block_step(child);
+	} else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
 		if (unlikely(!arch_has_single_step()))
 			return -EIO;
 		user_enable_single_step(child);
@@ -438,6 +448,9 @@ int ptrace_request(struct task_struct *child, long request,
 #ifdef PTRACE_SINGLESTEP
 	case PTRACE_SINGLESTEP:
 #endif
+#ifdef PTRACE_SINGLEBLOCK
+	case PTRACE_SINGLEBLOCK:
+#endif
 #ifdef PTRACE_SYSEMU
 	case PTRACE_SYSEMU:
 	case PTRACE_SYSEMU_SINGLESTEP:
-- 
cgit v1.1


From 65ea5b0349903585bfed9720fa06f5edb4f1cd25 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 30 Jan 2008 13:30:56 +0100
Subject: x86: rename the struct pt_regs members for 32/64-bit consistency

We have a lot of code which differs only by the naming of specific
members of structures that contain registers.  In order to enable
additional unifications, this patch drops the e- or r- size prefix
from the register names in struct pt_regs, and drops the x- prefixes
for segment registers on the 32-bit side.

This patch also performs the equivalent renames in some additional
places that might be candidates for unification in the future.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index afa4f78..bf49ce6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
 		current->comm, task_pid_nr(current), signr);
 
 #if defined(__i386__) && !defined(__arch_um__)
-	printk("code at %08lx: ", regs->eip);
+	printk("code at %08lx: ", regs->ip);
 	{
 		int i;
 		for (i = 0; i < 16; i++) {
 			unsigned char insn;
 
-			__get_user(insn, (unsigned char *)(regs->eip + i));
+			__get_user(insn, (unsigned char *)(regs->ip + i));
 			printk("%02x ", insn);
 		}
 	}
-- 
cgit v1.1


From 95c354fe9f7d6decc08a92aa26eb233ecc2155bf Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 30 Jan 2008 13:31:20 +0100
Subject: spinlock: lockbreak cleanup

The break_lock data structure and code for spinlocks is quite nasty.
Not only does it double the size of a spinlock but it changes locking to
a potentially less optimal trylock.

Put all of that under CONFIG_GENERIC_LOCKBREAK, and introduce a
__raw_spin_is_contended that uses the lock data itself to determine whether
there are waiters on the lock, to be used if CONFIG_GENERIC_LOCKBREAK is
not set.

Rename need_lockbreak to spin_needbreak, make it use spin_is_contended to
decouple it from the spinlock implementation, and make it typesafe (rwlocks
do not have any need_lockbreak sites -- why do they even get bloated up
with that break_lock then?).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c    | 16 ++++++----------
 kernel/spinlock.c |  3 +--
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 524285e..ba4c880 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4945,19 +4945,15 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int cond_resched_lock(spinlock_t *lock)
 {
+	int resched = need_resched() && system_state == SYSTEM_RUNNING;
 	int ret = 0;
 
-	if (need_lockbreak(lock)) {
+	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
-		cpu_relax();
-		ret = 1;
-		spin_lock(lock);
-	}
-	if (need_resched() && system_state == SYSTEM_RUNNING) {
-		spin_release(&lock->dep_map, 1, _THIS_IP_);
-		_raw_spin_unlock(lock);
-		preempt_enable_no_resched();
-		__cond_resched();
+		if (resched && need_resched())
+			__cond_resched();
+		else
+			cpu_relax();
 		ret = 1;
 		spin_lock(lock);
 	}
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index cd72424..ae28c82 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
  * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
  * not re-enabled during lock-acquire (which the preempt-spin-ops do):
  */
-#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
-	defined(CONFIG_DEBUG_LOCK_ALLOC)
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
-- 
cgit v1.1


From 16c3e389e7a7254ff8dc7029ac4fbe996c3c75bf Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:47 +0100
Subject: x86: ptrace_request peekdata/pokedata

This makes ptrace_request handle {PEEK,POKE}{TEXT,DATA} directly.
Every arch_ptrace that could call generic_ptrace_peekdata already
has a default case calling ptrace_request, so this keeps things
simpler for the arch code.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/ptrace.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 973d727..e6a99d2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -426,6 +426,13 @@ int ptrace_request(struct task_struct *child, long request,
 	int ret = -EIO;
 
 	switch (request) {
+	case PTRACE_PEEKTEXT:
+	case PTRACE_PEEKDATA:
+		return generic_ptrace_peekdata(child, addr, data);
+	case PTRACE_POKETEXT:
+	case PTRACE_POKEDATA:
+		return generic_ptrace_pokedata(child, addr, data);
+
 #ifdef PTRACE_OLDSETOPTIONS
 	case PTRACE_OLDSETOPTIONS:
 #endif
-- 
cgit v1.1


From 032d82d9065dec0e26718eca376c2029e4bd0595 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:47 +0100
Subject: x86: compat_ptrace_request

This adds a compat_ptrace_request that is the analogue of ptrace_request
for the things that 32-on-64 ptrace implementations can share in common.
So far there are just a couple of requests handled generically.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/ptrace.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e6a99d2..ed1c3d5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -607,3 +607,41 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
 	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
 	return (copied == sizeof(data)) ? 0 : -EIO;
 }
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+int compat_ptrace_request(struct task_struct *child, compat_long_t request,
+			  compat_ulong_t addr, compat_ulong_t data)
+{
+	compat_ulong_t __user *datap = compat_ptr(data);
+	compat_ulong_t word;
+	int ret;
+
+	switch (request) {
+	case PTRACE_PEEKTEXT:
+	case PTRACE_PEEKDATA:
+		ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+		if (ret != sizeof(word))
+			ret = -EIO;
+		else
+			ret = put_user(word, datap);
+		break;
+
+	case PTRACE_POKETEXT:
+	case PTRACE_POKEDATA:
+		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		ret = (ret != sizeof(data) ? -EIO : 0);
+		break;
+
+	case PTRACE_GETEVENTMSG:
+		ret = put_user((compat_ulong_t) child->ptrace_message, datap);
+		break;
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif	/* CONFIG_COMPAT */
-- 
cgit v1.1


From c269f19617f508cc5c29c0b064c1a437d7011a46 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 30 Jan 2008 13:31:48 +0100
Subject: x86: compat_sys_ptrace

This adds a generic definition of compat_sys_ptrace that calls
compat_arch_ptrace, parallel to sys_ptrace/arch_ptrace.  Some
machines needing this already define a function by that name.
The new generic function is defined only on machines that
put #define __ARCH_WANT_COMPAT_SYS_PTRACE into asm/ptrace.h.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/ptrace.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ed1c3d5..e6e9b8b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -644,4 +644,50 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 
 	return ret;
 }
+
+#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
+asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
+				  compat_long_t addr, compat_long_t data)
+{
+	struct task_struct *child;
+	long ret;
+
+	/*
+	 * This lock_kernel fixes a subtle race with suid exec
+	 */
+	lock_kernel();
+	if (request == PTRACE_TRACEME) {
+		ret = ptrace_traceme();
+		goto out;
+	}
+
+	child = ptrace_get_task_struct(pid);
+	if (IS_ERR(child)) {
+		ret = PTR_ERR(child);
+		goto out;
+	}
+
+	if (request == PTRACE_ATTACH) {
+		ret = ptrace_attach(child);
+		/*
+		 * Some architectures need to do book-keeping after
+		 * a ptrace attach.
+		 */
+		if (!ret)
+			arch_ptrace_attach(child);
+		goto out_put_task_struct;
+	}
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (!ret)
+		ret = compat_arch_ptrace(child, request, addr, data);
+
+ out_put_task_struct:
+	put_task_struct(child);
+ out:
+	unlock_kernel();
+	return ret;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
+
 #endif	/* CONFIG_COMPAT */
-- 
cgit v1.1


From 9e094c17ee2b85130ab7b2ea37456f6867eb687a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 30 Jan 2008 13:32:48 +0100
Subject: genirq: turn irq debugging options into module params

This allows to change them at runtime using sysfs. No need to
reboot to set them.

I only added aliases (kernel.noirqdebug etc.) so the old options
still work.

Signed-off-by: Andi Kleen <ak@suse.de>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/spurious.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32b1619..a6b2bc8 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
+#include <linux/moduleparam.h>
 
 static int irqfixup __read_mostly;
 
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
 }
 
 __setup("noirqdebug", noirqdebug_setup);
+module_param(noirqdebug, bool, 0644);
+MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
 
 static int __init irqfixup_setup(char *str)
 {
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
 }
 
 __setup("irqfixup", irqfixup_setup);
+module_param(irqfixup, int, 0644);
+MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
 
 static int __init irqpoll_setup(char *str)
 {
-- 
cgit v1.1


From 96d97cf03b3d68e6c857623da93acd522b2b7e1a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 30 Jan 2008 13:32:48 +0100
Subject: x86: add /proc/irq/*/spurious to dump the spurious irq debugging
 state

This is useful to debug problems with interrupt handlers that return
sometimes IRQ_NONE.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50b81b9..c2f2ccb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 
 #endif
 
+static int irq_spurious_read(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	struct irq_desc *d = &irq_desc[(long) data];
+	return sprintf(page, "count %u\n"
+			     "unhandled %u\n"
+			     "last_unhandled %u ms\n",
+			d->irq_count,
+			d->irqs_unhandled,
+			jiffies_to_msecs(d->last_unhandled));
+}
+
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 void register_irq_proc(unsigned int irq)
 {
 	char name [MAX_NAMELEN];
+	struct proc_dir_entry *entry;
 
 	if (!root_irq_dir ||
 		(irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
 
 #ifdef CONFIG_SMP
 	{
-		struct proc_dir_entry *entry;
-
 		/* create /proc/irq/<irq>/smp_affinity */
 		entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
 
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
 		}
 	}
 #endif
+
+	entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
+	if (entry) {
+		entry->data = (void *)(long)irq;
+		entry->read_proc = irq_spurious_read;
+	}
 }
 
 #undef MAX_NAMELEN
-- 
cgit v1.1


From 79b4cc5ee7a8086ac2c9c0afa52e6d687ce1ffef Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 30 Jan 2008 13:32:50 +0100
Subject: debug: move WARN_ON() out of line

A quick grep shows that there are currently 1145 instances of WARN_ON
in the kernel. Currently, WARN_ON is pretty much entirely inlined,
which makes it hard to enhance it without growing the size of the kernel
(and getting Andrew unhappy).

This patch build on top of Olof's patch that introduces __WARN,
and places the slowpath out of line. It also uses Ingo's suggestion
to not use __FUNCTION__ but to use kallsyms to do the lookup;
this saves a ton of extra space since gcc doesn't need to store the function
string twice now:

3936367  833603  624736 5394706  525112 vmlinux.before
3917508  833603  624736 5375847  520767 vmlinux-slowpath

15Kb savings...

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Olof Johansson <olof@lixom.net>
Acked-by: Matt Meckall <mpm@selenic.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index da4d6ba..0ebea43 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,6 +20,7 @@
 #include <linux/kexec.h>
 #include <linux/debug_locks.h>
 #include <linux/random.h>
+#include <linux/kallsyms.h>
 
 int panic_on_oops;
 int tainted;
@@ -292,6 +293,20 @@ void oops_exit(void)
 		(unsigned long long)oops_id);
 }
 
+#ifdef WANT_WARN_ON_SLOWPATH
+void warn_on_slowpath(const char *file, int line)
+{
+	char function[KSYM_SYMBOL_LEN];
+	unsigned long caller = (unsigned long) __builtin_return_address(0);
+
+	sprint_symbol(function, caller);
+	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+		line, function);
+	dump_stack();
+}
+EXPORT_SYMBOL(warn_on_slowpath);
+#endif
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
  * Called when gcc's -fstack-protector feature is used, and
-- 
cgit v1.1


From 71c339116a216b181fc5e203ef51a033fe5e38cf Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 30 Jan 2008 13:32:50 +0100
Subject: debug: add the end-of-trace marker and the module list to

Unlike oopses, WARN_ON() currently does't print the loaded modules list.
This makes it harder to take action on certain bug reports. For example,
recently there were a set of WARN_ON()s reported in the mac80211 stack,
which were just signalling a driver bug. It takes then anther round trip
to the bug reporter (if he responds at all) to find out which driver
is at fault.

Another issue is that, unlike oopses, WARN_ON() doesn't currently printk
the helpful "cut here" line, nor the "end of trace" marker.
Now that WARN_ON() is out of line, the size increase due to this is
minimal and it's worth adding.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 0ebea43..d9e90cf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -281,6 +281,13 @@ static int init_oops_id(void)
 }
 late_initcall(init_oops_id);
 
+static void print_oops_end_marker(void)
+{
+	init_oops_id();
+	printk(KERN_WARNING "---[ end trace %016llx ]---\n",
+		(unsigned long long)oops_id);
+}
+
 /*
  * Called when the architecture exits its oops handler, after printing
  * everything.
@@ -288,9 +295,7 @@ late_initcall(init_oops_id);
 void oops_exit(void)
 {
 	do_oops_enter_exit();
-	init_oops_id();
-	printk(KERN_WARNING "---[ end trace %016llx ]---\n",
-		(unsigned long long)oops_id);
+	print_oops_end_marker();
 }
 
 #ifdef WANT_WARN_ON_SLOWPATH
@@ -298,11 +303,14 @@ void warn_on_slowpath(const char *file, int line)
 {
 	char function[KSYM_SYMBOL_LEN];
 	unsigned long caller = (unsigned long) __builtin_return_address(0);
-
 	sprint_symbol(function, caller);
+
+	printk(KERN_WARNING "------------[ cut here ]------------\n");
 	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
 		line, function);
+	print_modules();
 	dump_stack();
+	print_oops_end_marker();
 }
 EXPORT_SYMBOL(warn_on_slowpath);
 #endif
-- 
cgit v1.1


From 8c1c9356429741a82ff176d0f3400fb9e06b2a30 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Wed, 30 Jan 2008 13:32:53 +0100
Subject: x86: kprobes: add kprobes smoke tests that run on boot

Here is a quick and naive smoke test for kprobes. This is intended to
just verify if some unrelated change broke the *probes subsystem. It is
self contained, architecture agnostic and isn't of any great use by itself.

This needs to be built in the kernel and runs a basic set of tests to
verify if kprobes, jprobes and kretprobes run fine on the kernel. In case
of an error, it'll print out a message with a "BUG" prefix.

This is a start; we intend to add more tests to this bucket over time.

Thanks to Jim Keniston and Masami Hiramatsu for comments and suggestions.

Tested on x86 (32/64) and powerpc.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile       |   1 +
 kernel/kprobes.c      |   2 +
 kernel/test_kprobes.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 219 insertions(+)
 create mode 100644 kernel/test_kprobes.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 390d421..62015c3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d81..d0493ea 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
 
+	if (!err)
+		init_test_probes();
 	return err;
 }
 
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 0000000..88cdb10
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,216 @@
+/*
+ * test_kprobes.c - simple sanity test for *probes
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+
+#define div_factor 3
+
+static u32 rand1, preh_val, posth_val, jph_val;
+static int errors, handler_errors, num_tests;
+
+static noinline u32 kprobe_target(u32 value)
+{
+	/*
+	 * gcc ignores noinline on some architectures unless we stuff
+	 * sufficient lard into the function. The get_kprobe() here is
+	 * just for that.
+	 *
+	 * NOTE: We aren't concerned about the correctness of get_kprobe()
+	 * here; hence, this call is neither under !preempt nor with the
+	 * kprobe_mutex held. This is fine(tm)
+	 */
+	if (get_kprobe((void *)0xdeadbeef))
+		printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
+
+	return (value / div_factor);
+}
+
+static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	preh_val = (rand1 / div_factor);
+	return 0;
+}
+
+static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
+		unsigned long flags)
+{
+	if (preh_val != (rand1 / div_factor)) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"incorrect value in post_handler\n");
+	}
+	posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp = {
+	.symbol_name = "kprobe_target",
+	.pre_handler = kp_pre_handler,
+	.post_handler = kp_post_handler
+};
+
+static int test_kprobe(void)
+{
+	int ret;
+
+	ret = register_kprobe(&kp);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_kprobe returned %d\n", ret);
+		return ret;
+	}
+
+	ret = kprobe_target(rand1);
+	unregister_kprobe(&kp);
+
+	if (preh_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe pre_handler not called\n");
+		handler_errors++;
+	}
+
+	if (posth_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kprobe post_handler not called\n");
+		handler_errors++;
+	}
+
+	return 0;
+}
+
+static u32 j_kprobe_target(u32 value)
+{
+	if (value != rand1) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"incorrect value in jprobe handler\n");
+	}
+
+	jph_val = rand1;
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe jp = {
+	.entry		= j_kprobe_target,
+	.kp.symbol_name = "kprobe_target"
+};
+
+static int test_jprobe(void)
+{
+	int ret;
+
+	ret = register_jprobe(&jp);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_jprobe returned %d\n", ret);
+		return ret;
+	}
+
+	ret = kprobe_target(rand1);
+	unregister_jprobe(&jp);
+	if (jph_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"jprobe handler not called\n");
+		handler_errors++;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static u32 krph_val;
+
+static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	unsigned long ret = regs_return_value(regs);
+
+	if (ret != (rand1 / div_factor)) {
+		handler_errors++;
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"incorrect value in kretprobe handler\n");
+	}
+
+	krph_val = (rand1 / div_factor);
+	return 0;
+}
+
+static struct kretprobe rp = {
+	.handler	= return_handler,
+	.kp.symbol_name = "kprobe_target"
+};
+
+static int test_kretprobe(void)
+{
+	int ret;
+
+	ret = register_kretprobe(&rp);
+	if (ret < 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"register_kretprobe returned %d\n", ret);
+		return ret;
+	}
+
+	ret = kprobe_target(rand1);
+	unregister_kretprobe(&rp);
+	if (krph_val == 0) {
+		printk(KERN_ERR "Kprobe smoke test failed: "
+				"kretprobe handler not called\n");
+		handler_errors++;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+int init_test_probes(void)
+{
+	int ret;
+
+	do {
+		rand1 = random32();
+	} while (rand1 <= div_factor);
+
+	printk(KERN_INFO "Kprobe smoke test started\n");
+	num_tests++;
+	ret = test_kprobe();
+	if (ret < 0)
+		errors++;
+
+	num_tests++;
+	ret = test_jprobe();
+	if (ret < 0)
+		errors++;
+
+#ifdef CONFIG_KRETPROBES
+	num_tests++;
+	ret = test_kretprobe();
+	if (ret < 0)
+		errors++;
+#endif /* CONFIG_KRETPROBES */
+
+	if (errors)
+		printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
+				"%d tests failed\n", errors, num_tests);
+	else if (handler_errors)
+		printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
+				"running handlers\n", handler_errors);
+	else
+		printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+
+	return 0;
+}
-- 
cgit v1.1


From 076f9776f5d8d131b36955db8641aba3893c2c1b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jan 2008 13:33:06 +0100
Subject: x86: make early printk selectable on 64-bit as well

Enable CONFIG_EMBEDDED to select CONFIG_EARLY_PRINTK on 64-bit as well.

saves ~2K:

   text    data     bss     dec     hex filename
   7290283 3672091 1907848 12870222         c4624e vmlinux.before
   7288373 3671795 1907848 12868016         c459b0 vmlinux.after

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 3b7c968..58bbec6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,13 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * Architectures can override it:
+ */
+void __attribute__((weak)) early_printk(const char *fmt, ...)
+{
+}
+
 #define __LOG_BUF_LEN	(1 << CONFIG_LOG_BUF_SHIFT)
 
 /* printk's without a loglevel use this.. */
-- 
cgit v1.1


From 6dab27784b2a97823b522e1cb88e40be40a93d45 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 30 Jan 2008 13:33:08 +0100
Subject: x86: add a simple backtrace test module

During the work on the x86 32 and 64 bit backtrace code I found it useful
to have a simple test module to test a process and irq context backtrace.
Since the existing backtrace code was buggy, I figure it might be useful
to have such a test module in the kernel so that maybe we can even
detect such bugs earlier..

[ mingo@elte.hu: build fix ]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile        |  1 +
 kernel/backtracetest.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 kernel/backtracetest.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 62015c3..8885627 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 0000000..d1a7605
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,48 @@
+/*
+ * Simple stack backtrace regression test module
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+static struct timer_list backtrace_timer;
+
+static void backtrace_test_timer(unsigned long data)
+{
+	printk("Testing a backtrace from irq context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+	dump_stack();
+}
+static int backtrace_regression_test(void)
+{
+	printk("====[ backtrace testing ]===========\n");
+	printk("Testing a backtrace from process context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+	dump_stack();
+
+	init_timer(&backtrace_timer);
+	backtrace_timer.function = backtrace_test_timer;
+	mod_timer(&backtrace_timer, jiffies + 10);
+
+	msleep(10);
+	printk("====[ end of backtrace testing ]====\n");
+	return 0;
+}
+
+static void exitf(void)
+{
+}
+
+module_init(backtrace_regression_test);
+module_exit(exitf);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
-- 
cgit v1.1


From 70edcd77a0d6d0f8731c826764f5eb6732f521e9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jan 2008 13:33:24 +0100
Subject: genirq: stackdump after the "Trying to free already-free IRQ" message

these bugs are harder to find than they seem, a stackdump helps.

make it dependent on CONFIG_DEBUG_SHIRQ so that people can turn it off
if it annoys them.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f31422..438a014 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
 			return;
 		}
 		printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
+#ifdef CONFIG_DEBUG_SHIRQ
+		dump_stack();
+#endif
 		spin_unlock_irqrestore(&desc->lock, flags);
 		return;
 	}
-- 
cgit v1.1


From dd5af90a7f3d79e04b7eace9a98644dbf2038f4d Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Wed, 30 Jan 2008 13:33:32 +0100
Subject: x86/non-x86: percpu, node ids, apic ids x86.git fixup

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/module.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index f6a4e72..bd60278e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -430,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
 
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
 static int percpu_modinit(void)
 {
 	pcpu_num_used = 2;
-- 
cgit v1.1


From 6d4e4c4fca5be806b888d606894d914847e82d78 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 21 Nov 2007 16:41:05 +0200
Subject: KVM: Disallow fork() and similar games when using a VM

We don't want the meaning of guest userspace changing under our feet.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 314f510..05e0b6f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
 	destroy_context(mm);
 	free_mm(mm);
 }
+EXPORT_SYMBOL_GPL(__mmdrop);
 
 /*
  * Decrement the use count and release all resources for an mm.
-- 
cgit v1.1