From e213e87785559eaf3107897226817aea9291b06f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 15 Aug 2008 18:12:47 +0200
Subject: x86: Fix ioremap off by one BUG

Jean Delvare's machine triggered this BUG

acpi_os_map_memory phys ffff0000 size 65535
------------[ cut here ]------------
kernel BUG at arch/x86/mm/pat.c:233!

with ACPI in the backtrace.

Adding some debugging output showed that ACPI calls

acpi_os_map_memory phys ffff0000 size 65535

And ioremap/PAT does this check in 32bit, so addr+size wraps and the BUG
in reserve_memtype() triggers incorrectly.

        BUG_ON(start >= end); /* end is exclusive */

But reserve_memtype already uses u64:

int reserve_memtype(u64 start, u64 end,

so the 32bit truncation must happen in the caller. Presumably in ioremap
when it passes this information to reserve_memtype().

This patch does this computation in 64bit.

http://bugzilla.kernel.org/show_bug.cgi?id=11346

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/mm/ioremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 016f335..6ba6f88 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	phys_addr &= PAGE_MASK;
 	size = PAGE_ALIGN(last_addr+1) - phys_addr;
 
-	retval = reserve_memtype(phys_addr, phys_addr + size,
+	retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
 						prot_val, &new_prot_val);
 	if (retval) {
 		pr_debug("Warning: reserve_memtype returned %d\n", retval);
-- 
cgit v1.1


From a2bd92023357e47f22a34d4cb1635453546662bc Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Wed, 30 Jul 2008 19:21:42 -0700
Subject: cpuidle: Do not use poll_idle unless user asks for it

poll_idle was added to CPUIDLE, just as a low latency idle handler, to be
used in cases when user desires CPUs not to enter any idle state at all. It
was supposed to be a run time idle=poll option to the user. But, it was indeed
getting used during normal menu and ladder governor default case, with no
special user setting (Reported by Linus Torvalds).

Change below ensures that poll_idle will not be used unless user explicitly
asks pm_qos infrastructure for zero latency requirement.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/cpuidle/governors/ladder.c | 14 ++++++++++----
 drivers/cpuidle/governors/menu.c   | 11 +++++++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index ba7b9a6..27ab3bf 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -67,10 +67,17 @@ static int ladder_select_state(struct cpuidle_device *dev)
 	struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
 	struct ladder_device_state *last_state;
 	int last_residency, last_idx = ldev->last_state_idx;
+	int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
 
 	if (unlikely(!ldev))
 		return 0;
 
+	/* Special case when user has set very strict latency requirement */
+	if (unlikely(latency_req == 0)) {
+		ladder_do_selection(ldev, last_idx, 0);
+		return 0;
+	}
+
 	last_state = &ldev->states[last_idx];
 
 	if (dev->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID)
@@ -81,8 +88,7 @@ static int ladder_select_state(struct cpuidle_device *dev)
 	/* consider promotion */
 	if (last_idx < dev->state_count - 1 &&
 	    last_residency > last_state->threshold.promotion_time &&
-	    dev->states[last_idx + 1].exit_latency <=
-			pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
+	    dev->states[last_idx + 1].exit_latency <= latency_req) {
 		last_state->stats.promotion_count++;
 		last_state->stats.demotion_count = 0;
 		if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) {
@@ -92,7 +98,7 @@ static int ladder_select_state(struct cpuidle_device *dev)
 	}
 
 	/* consider demotion */
-	if (last_idx > 0 &&
+	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
 	    last_residency < last_state->threshold.demotion_time) {
 		last_state->stats.demotion_count++;
 		last_state->stats.promotion_count = 0;
@@ -117,7 +123,7 @@ static int ladder_enable_device(struct cpuidle_device *dev)
 	struct ladder_device_state *lstate;
 	struct cpuidle_state *state;
 
-	ldev->last_state_idx = 0;
+	ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
 	for (i = 0; i < dev->state_count; i++) {
 		state = &dev->states[i];
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 78d77c5..b8f3e21 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -34,21 +34,28 @@ static DEFINE_PER_CPU(struct menu_device, menu_devices);
 static int menu_select(struct cpuidle_device *dev)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
+	int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
 	int i;
 
+	/* Special case when user has set very strict latency requirement */
+	if (unlikely(latency_req == 0)) {
+		data->last_state_idx = 0;
+		return 0;
+	}
+
 	/* determine the expected residency time */
 	data->expected_us =
 		(u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000;
 
 	/* find the deepest idle state that satisfies our constraints */
-	for (i = 1; i < dev->state_count; i++) {
+	for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) {
 		struct cpuidle_state *s = &dev->states[i];
 
 		if (s->target_residency > data->expected_us)
 			break;
 		if (s->target_residency > data->predicted_us)
 			break;
-		if (s->exit_latency > pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY))
+		if (s->exit_latency > latency_req)
 			break;
 	}
 
-- 
cgit v1.1


From 320eee776357db52d6fcfb11cff985b1976a4595 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Wed, 30 Jul 2008 19:21:43 -0700
Subject: cpuidle: Menu governor fix wrong usage of measured_us

There is a bug in menu governor where we have
		if (data->elapsed_us < data->elapsed_us + measured_us)

with measured_us already having elapsed_us added in tickless case here
	unsigned int measured_us =
		cpuidle_get_last_residency(dev) + data->elapsed_us;

Also, it should be last_residency, not measured_us, that need to be used to
do comparing and distinguish between expected & non-expected events.

Refactor menu_reflect() to fix these two problems.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Wei Gang <gang.wei@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/cpuidle/governors/menu.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b8f3e21..8d7cf3f 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -74,9 +74,9 @@ static void menu_reflect(struct cpuidle_device *dev)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int last_idx = data->last_state_idx;
-	unsigned int measured_us =
-		cpuidle_get_last_residency(dev) + data->elapsed_us;
+	unsigned int last_idle_us = cpuidle_get_last_residency(dev);
 	struct cpuidle_state *target = &dev->states[last_idx];
+	unsigned int measured_us;
 
 	/*
 	 * Ugh, this idle state doesn't support residency measurements, so we
@@ -84,20 +84,27 @@ static void menu_reflect(struct cpuidle_device *dev)
 	 * for one full standard timer tick.  However, be aware that this
 	 * could potentially result in a suboptimal state transition.
 	 */
-	if (!(target->flags & CPUIDLE_FLAG_TIME_VALID))
-		measured_us = USEC_PER_SEC / HZ;
+	if (unlikely(!(target->flags & CPUIDLE_FLAG_TIME_VALID)))
+		last_idle_us = USEC_PER_SEC / HZ;
 
-	/* Predict time remaining until next break event */
-	if (measured_us + BREAK_FUZZ < data->expected_us - target->exit_latency) {
-		data->predicted_us = max(measured_us, data->last_measured_us);
+	/*
+	 * measured_us and elapsed_us are the cumulative idle time, since the
+	 * last time we were woken out of idle by an interrupt.
+	 */
+	if (data->elapsed_us <= data->elapsed_us + last_idle_us)
+		measured_us = data->elapsed_us + last_idle_us;
+	else
+		measured_us = -1;
+
+	/* Predict time until next break event */
+	data->predicted_us = max(measured_us, data->last_measured_us);
+
+	if (last_idle_us + BREAK_FUZZ <
+	    data->expected_us - target->exit_latency) {
 		data->last_measured_us = measured_us;
 		data->elapsed_us = 0;
 	} else {
-		if (data->elapsed_us < data->elapsed_us + measured_us)
-			data->elapsed_us = measured_us;
-		else
-			data->elapsed_us = -1;
-		data->predicted_us = max(measured_us, data->last_measured_us);
+		data->elapsed_us = measured_us;
 	}
 }
 
-- 
cgit v1.1


From 06d9e908b2248f983b186aaf569c58e1430db85d Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Wed, 30 Jul 2008 19:21:44 -0700
Subject: cpuidle: Make ladder governor honor latency requirements fully

ladder governor only honored latency requirement when promoting C-states.
Instead. it should check for latency requirement on each idle call,
and demote to appropriate C-state when there is a latency requirement change.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/cpuidle/governors/ladder.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 27ab3bf..a4bec3f 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -99,6 +99,18 @@ static int ladder_select_state(struct cpuidle_device *dev)
 
 	/* consider demotion */
 	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
+	    dev->states[last_idx].exit_latency > latency_req) {
+		int i;
+
+		for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
+			if (dev->states[i].exit_latency <= latency_req)
+				break;
+		}
+		ladder_do_selection(ldev, last_idx, i);
+		return i;
+	}
+
+	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
 	    last_residency < last_state->threshold.demotion_time) {
 		last_state->stats.demotion_count++;
 		last_state->stats.promotion_count = 0;
-- 
cgit v1.1