From 39fe05e58c5e448601ce46e6b03900d5bf31c4b0 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Wed, 12 Aug 2009 11:16:12 +0800
Subject: x86, hpet: Disable per-cpu hpet timer if ARAT is supported

If CPU support always running local APIC timer, per-cpu hpet
timer could be disabled, which is useless and wasteful in such
case. Let's leave the timers to others.

The effect is that we reserve less timers.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: venkatesh.pallipadi@intel.com
LKML-Reference: <20090812031612.GA10062@sli10-desk.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hpet.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bd..5969e10 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -584,6 +584,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	unsigned int num_timers_used = 0;
 	int i;
 
+	if (boot_cpu_has(X86_FEATURE_ARAT))
+		return;
 	id = hpet_readl(HPET_ID);
 
 	num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -872,10 +874,8 @@ int __init hpet_enable(void)
 
 	if (id & HPET_ID_LEGSUP) {
 		hpet_legacy_clockevent_register();
-		hpet_msi_capability_lookup(2);
 		return 1;
 	}
-	hpet_msi_capability_lookup(0);
 	return 0;
 
 out_nohpet:
@@ -908,9 +908,17 @@ static __init int hpet_late_init(void)
 	if (!hpet_virt_address)
 		return -ENODEV;
 
+	if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
+		hpet_msi_capability_lookup(2);
+	else
+		hpet_msi_capability_lookup(0);
+
 	hpet_reserve_platform_timers(hpet_readl(HPET_ID));
 	hpet_print_config();
 
+	if (boot_cpu_has(X86_FEATURE_ARAT))
+		return 0;
+
 	for_each_online_cpu(cpu) {
 		hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
 	}
-- 
cgit v1.1


From 5946fa3d5cdeb846a647a1900026af9f8b08c8b5 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 19 Aug 2009 08:44:24 +0100
Subject: x86, hpet: Simplify the HPET code

On 64-bits, using unsigned long when unsigned int suffices
needlessly creates larger code (due to the need for REX
prefixes), and most of the logic in hpet.c really doesn't need
64-bit operations.

At once this avoids the need for a couple of type casts.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <4A8BC9780200007800010832@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/hpet.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 5969e10..ba575f0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -47,12 +47,12 @@ struct hpet_dev {
 	char				name[10];
 };
 
-unsigned long hpet_readl(unsigned long a)
+inline unsigned int hpet_readl(unsigned int a)
 {
 	return readl(hpet_virt_address + a);
 }
 
-static inline void hpet_writel(unsigned long d, unsigned long a)
+static inline void hpet_writel(unsigned int d, unsigned int a)
 {
 	writel(d, hpet_virt_address + a);
 }
@@ -167,7 +167,7 @@ do {								\
 
 static void hpet_reserve_msi_timers(struct hpet_data *hd);
 
-static void hpet_reserve_platform_timers(unsigned long id)
+static void hpet_reserve_platform_timers(unsigned int id)
 {
 	struct hpet __iomem *hpet = hpet_virt_address;
 	struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +205,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
 
 }
 #else
-static void hpet_reserve_platform_timers(unsigned long id) { }
+static void hpet_reserve_platform_timers(unsigned int id) { }
 #endif
 
 /*
@@ -246,7 +246,7 @@ static void hpet_reset_counter(void)
 
 static void hpet_start_counter(void)
 {
-	unsigned long cfg = hpet_readl(HPET_CFG);
+	unsigned int cfg = hpet_readl(HPET_CFG);
 	cfg |= HPET_CFG_ENABLE;
 	hpet_writel(cfg, HPET_CFG);
 }
@@ -271,7 +271,7 @@ static void hpet_resume_counter(void)
 
 static void hpet_enable_legacy_int(void)
 {
-	unsigned long cfg = hpet_readl(HPET_CFG);
+	unsigned int cfg = hpet_readl(HPET_CFG);
 
 	cfg |= HPET_CFG_LEGACY;
 	hpet_writel(cfg, HPET_CFG);
@@ -314,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
-	unsigned long cfg, cmp, now;
+	unsigned int cfg, cmp, now;
 	uint64_t delta;
 
 	switch (mode) {
@@ -323,7 +323,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
 		delta >>= evt->shift;
 		now = hpet_readl(HPET_COUNTER);
-		cmp = now + (unsigned long) delta;
+		cmp = now + (unsigned int) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		/* Make sure we use edge triggered interrupts */
 		cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +339,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
 		 * Publication # 24674)
 		 */
-		hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
+		hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
 		hpet_start_counter();
 		hpet_print_config();
 		break;
@@ -387,9 +387,9 @@ static int hpet_next_event(unsigned long delta,
 	 * what we wrote hit the chip before we compare it to the
 	 * counter.
 	 */
-	WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt);
+	WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt);
 
-	return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+	return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
 }
 
 static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +415,7 @@ static struct hpet_dev	*hpet_devs;
 void hpet_msi_unmask(unsigned int irq)
 {
 	struct hpet_dev *hdev = get_irq_data(irq);
-	unsigned long cfg;
+	unsigned int cfg;
 
 	/* unmask it */
 	cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +425,7 @@ void hpet_msi_unmask(unsigned int irq)
 
 void hpet_msi_mask(unsigned int irq)
 {
-	unsigned long cfg;
+	unsigned int cfg;
 	struct hpet_dev *hdev = get_irq_data(irq);
 
 	/* mask it */
@@ -600,7 +600,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 
 	for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
 		struct hpet_dev *hdev = &hpet_devs[num_timers_used];
-		unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+		unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
 
 		/* Only consider HPET timer with MSI support */
 		if (!(cfg & HPET_TN_FSB_CAP))
@@ -815,7 +815,7 @@ static int hpet_clocksource_register(void)
  */
 int __init hpet_enable(void)
 {
-	unsigned long id;
+	unsigned int id;
 	int i;
 
 	if (!is_hpet_capable())
@@ -933,7 +933,7 @@ fs_initcall(hpet_late_init);
 void hpet_disable(void)
 {
 	if (is_hpet_capable()) {
-		unsigned long cfg = hpet_readl(HPET_CFG);
+		unsigned int cfg = hpet_readl(HPET_CFG);
 
 		if (hpet_legacy_int_enabled) {
 			cfg &= ~HPET_CFG_LEGACY;
@@ -973,8 +973,8 @@ static int hpet_prev_update_sec;
 static struct rtc_time hpet_alarm_time;
 static unsigned long hpet_pie_count;
 static u32 hpet_t1_cmp;
-static unsigned long hpet_default_delta;
-static unsigned long hpet_pie_delta;
+static u32 hpet_default_delta;
+static u32 hpet_pie_delta;
 static unsigned long hpet_pie_limit;
 
 static rtc_irq_handler irq_handler;
@@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
  */
 int hpet_rtc_timer_init(void)
 {
-	unsigned long cfg, cnt, delta, flags;
+	unsigned int cfg, cnt, delta;
+	unsigned long flags;
 
 	if (!is_hpet_enabled())
 		return 0;
@@ -1035,7 +1036,7 @@ int hpet_rtc_timer_init(void)
 
 		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
 		clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
-		hpet_default_delta = (unsigned long) clc;
+		hpet_default_delta = clc;
 	}
 
 	if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1121,7 +1122,7 @@ int hpet_set_periodic_freq(unsigned long freq)
 		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
 		do_div(clc, freq);
 		clc >>= hpet_clockevent.shift;
-		hpet_pie_delta = (unsigned long) clc;
+		hpet_pie_delta = clc;
 	}
 	return 1;
 }
@@ -1135,7 +1136,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
 
 static void hpet_rtc_timer_reinit(void)
 {
-	unsigned long cfg, delta;
+	unsigned int cfg, delta;
 	int lost_ints = -1;
 
 	if (unlikely(!hpet_rtc_flags)) {
-- 
cgit v1.1


From c8bc6f3c806f1fcbfdbf0b1ff6c52dba59192d3b Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 4 Aug 2009 12:07:09 -0700
Subject: x86: arch specific support for remapping HPET MSIs

x86 arch support for remapping HPET MSI's by associating the HPET timer block
with the interrupt-remapping HW unit and setting up appropriate irq_chip

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Jay Fenlason <fenlason@redhat.com>
LKML-Reference: <20090804190729.630510000@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/acpi/boot.c    |  1 +
 arch/x86/kernel/apic/io_apic.c | 49 +++++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/hpet.c         |  3 ++-
 3 files changed, 44 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6b8ca3a..eae642b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
 	}
 
 	hpet_address = hpet_tbl->address.address;
+	hpet_blockid = hpet_tbl->sequence;
 
 	/*
 	 * Some broken BIOSes advertise HPET at 0x0. We really do not
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d2ed6c5..d9c6f14 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3254,7 +3254,8 @@ void destroy_irq(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+			   struct msi_msg *msg, u8 hpet_id)
 {
 	struct irq_cfg *cfg;
 	int err;
@@ -3288,7 +3289,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		irte.dest_id = IRTE_DEST(dest);
 
 		/* Set source-id of interrupt request */
-		set_msi_sid(&irte, pdev);
+		if (pdev)
+			set_msi_sid(&irte, pdev);
+		else
+			set_hpet_sid(&irte, hpet_id);
 
 		modify_irte(irq, &irte);
 
@@ -3453,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(dev, irq, &msg);
+	ret = msi_compose_msg(dev, irq, &msg, -1);
 	if (ret < 0)
 		return ret;
 
@@ -3586,7 +3590,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
+	ret = msi_compose_msg(NULL, irq, &msg, -1);
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
@@ -3626,6 +3630,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 #endif /* CONFIG_SMP */
 
+static struct irq_chip ir_hpet_msi_type = {
+	.name = "IR-HPET_MSI",
+	.unmask = hpet_msi_unmask,
+	.mask = hpet_msi_mask,
+#ifdef CONFIG_INTR_REMAP
+	.ack = ir_ack_apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity = ir_set_msi_irq_affinity,
+#endif
+#endif
+	.retrigger = ioapic_retrigger_irq,
+};
+
 static struct irq_chip hpet_msi_type = {
 	.name = "HPET_MSI",
 	.unmask = hpet_msi_unmask,
@@ -3637,20 +3654,36 @@ static struct irq_chip hpet_msi_type = {
 	.retrigger = ioapic_retrigger_irq,
 };
 
-int arch_setup_hpet_msi(unsigned int irq)
+int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 {
 	int ret;
 	struct msi_msg msg;
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	ret = msi_compose_msg(NULL, irq, &msg);
+	if (intr_remapping_enabled) {
+		struct intel_iommu *iommu = map_hpet_to_ir(id);
+		int index;
+
+		if (!iommu)
+			return -1;
+
+		index = alloc_irte(iommu, irq, 1);
+		if (index < 0)
+			return -1;
+	}
+
+	ret = msi_compose_msg(NULL, irq, &msg, id);
 	if (ret < 0)
 		return ret;
 
 	hpet_msi_write(irq, &msg);
 	desc->status |= IRQ_MOVE_PCNTXT;
-	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
-		"edge");
+	if (irq_remapped(irq))
+		set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
+					      handle_edge_irq, "edge");
+	else
+		set_irq_chip_and_handler_name(irq, &hpet_msi_type,
+					      handle_edge_irq, "edge");
 
 	return 0;
 }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba575f0..7f024ff 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,7 @@
  * HPET address is set in acpi/boot.c, when an ACPI entry exists
  */
 unsigned long				hpet_address;
+u8					hpet_blockid; /* OS timer block num */
 #ifdef CONFIG_PCI_MSI
 static unsigned long			hpet_num_timers;
 #endif
@@ -467,7 +468,7 @@ static int hpet_msi_next_event(unsigned long delta,
 
 static int hpet_setup_msi_irq(unsigned int irq)
 {
-	if (arch_setup_hpet_msi(irq)) {
+	if (arch_setup_hpet_msi(irq, hpet_blockid)) {
 		destroy_irq(irq);
 		return -EINVAL;
 	}
-- 
cgit v1.1


From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 19 Sep 2009 09:40:22 +0300
Subject: core, x86: Add user return notifiers

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

This will be used initially to speed up KVM task switching by lazily
updating MSRs.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/process.c | 2 ++
 arch/x86/kernel/signal.c  | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..e51b056 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76..c49f90f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		if (current->replacement_session_keyring)
 			key_replace_session_keyring();
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
-- 
cgit v1.1


From a6f05a6a0a1713d5b019f096799d49226807d3df Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 8 Oct 2009 18:02:54 -0700
Subject: x86-64: make compat_start_thread() match start_thread()

For no real good reason, compat_start_thread() was embedded inline in
<asm/elf.h> whereas the native start_thread() lives in process_*.c.
Move compat_start_thread() to process_64.c, remove gratuitious
differences, and fix a few items which mostly look like bit rot.

In particular, compat_start_thread() didn't do free_thread_xstate(),
which means it was hanging on to the xstate store area even when it
was not needed.  It was also not setting old_rsp, but it looks like
that generally shouldn't matter for a 32-bit process.

Note: compat_start_thread *has* to be a macro, since it is tested with
start_thread_ia32() as the out of line function name.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/kernel/process_64.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b6..7cf0a6b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -356,7 +356,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	percpu_write(old_rsp, new_sp);
 	regs->cs		= __USER_CS;
 	regs->ss		= __USER_DS;
-	regs->flags		= 0x200;
+	regs->flags		= X86_EFLAGS_IF;
 	set_fs(USER_DS);
 	/*
 	 * Free the old FP and other extended state
@@ -365,6 +365,27 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
+#ifdef CONFIG_IA32_EMULATION
+void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+{
+	loadsegment(fs, 0);
+	loadsegment(ds, __USER32_DS);
+	loadsegment(es, __USER32_DS);
+	load_gs_index(0);
+	regs->ip		= new_ip;
+	regs->sp		= new_sp;
+	percpu_write(old_rsp, new_sp);
+	regs->cs		= __USER32_CS;
+	regs->ss		= __USER32_DS;
+	regs->flags		= X86_EFLAGS_IF;
+	set_fs(USER_DS);
+	/*
+	 * Free the old FP and other extended state
+	 */
+	free_thread_xstate(current);
+}
+#endif
+
 /*
  *	switch_to(x,y) should switch tasks from x to y.
  *
-- 
cgit v1.1


From e634d8fc792c66c3d4ff45518c04848c1e28f221 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 9 Oct 2009 15:56:53 -0700
Subject: x86-64: merge the standard and compat start_thread() functions

The only thing left that differs between the standard and compat
start_thread functions is the actual segment numbers and the
prototype, so have a single common function which contains the guts
and two very small wrappers.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/kernel/process_64.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7cf0a6b..eb261c5 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -344,18 +344,20 @@ out:
 	return err;
 }
 
-void
-start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+		    unsigned long new_sp,
+		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 {
 	loadsegment(fs, 0);
-	loadsegment(es, 0);
-	loadsegment(ds, 0);
+	loadsegment(es, _ds);
+	loadsegment(ds, _ds);
 	load_gs_index(0);
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	percpu_write(old_rsp, new_sp);
-	regs->cs		= __USER_CS;
-	regs->ss		= __USER_DS;
+	regs->cs		= _cs;
+	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
 	set_fs(USER_DS);
 	/*
@@ -363,26 +365,19 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	 */
 	free_thread_xstate(current);
 }
-EXPORT_SYMBOL_GPL(start_thread);
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+	start_thread_common(regs, new_ip, new_sp,
+			    __USER_CS, __USER_DS, 0);
+}
 
 #ifdef CONFIG_IA32_EMULATION
 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
-	loadsegment(fs, 0);
-	loadsegment(ds, __USER32_DS);
-	loadsegment(es, __USER32_DS);
-	load_gs_index(0);
-	regs->ip		= new_ip;
-	regs->sp		= new_sp;
-	percpu_write(old_rsp, new_sp);
-	regs->cs		= __USER32_CS;
-	regs->ss		= __USER32_DS;
-	regs->flags		= X86_EFLAGS_IF;
-	set_fs(USER_DS);
-	/*
-	 * Free the old FP and other extended state
-	 */
-	free_thread_xstate(current);
+	start_thread_common(regs, new_ip, new_sp,
+			    __USER32_CS, __USER32_DS, __USER32_DS);
 }
 #endif
 
-- 
cgit v1.1


From 8ee2debce32412118cf8c239e0026ace56ea1425 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 25 Sep 2009 15:20:00 -0700
Subject: x86: Export k8 physical topology

To eventually interleave emulated nodes over physical nodes, we
need to know the physical topology of the machine without actually
registering it.  This does the k8 node setup in two parts:
detection and registration.  NUMA emulation can then used the
physical topology detected to setup the address ranges of emulated
nodes accordingly.  If emulation isn't used, the k8 nodes are
registered as normal.

Two formals are added to the x86 NUMA setup functions: `acpi' and
`k8'. These represent whether ACPI or K8 NUMA has been detected;
both cannot be true at the same time.  This specifies to the NUMA
emulation code whether an underlying physical NUMA topology exists
and which interface to use.

This patch deals solely with separating the k8 setup path into
Northbridge detection and registration steps and leaves the ACPI
changes for a subsequent patch.  The `acpi' formal is added here,
however, to avoid touching all the header files again in the next
patch.

This approach also ensures emulated nodes will not span physical
nodes so the true memory latency is not misrepresented.

k8_get_nodes() may now be used to export the k8 physical topology
of the machine for NUMA emulation.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251518400.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2..fda0032 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,6 +106,7 @@
 #include <asm/percpu.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
+#include <asm/k8.h>
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
@@ -691,6 +692,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 
 void __init setup_arch(char **cmdline_p)
 {
+	int acpi = 0;
+	int k8 = 0;
+
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
@@ -937,7 +941,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_numa_init();
 #endif
 
-	initmem_init(0, max_pfn);
+#ifdef CONFIG_K8_NUMA
+	k8 = !k8_numa_init(0, max_pfn);
+#endif
+
+	initmem_init(0, max_pfn, acpi, k8);
 
 #ifdef CONFIG_ACPI_SLEEP
 	/*
-- 
cgit v1.1


From 8716273caef7f55f39fe4fc6c69c5f9f197f41f1 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 25 Sep 2009 15:20:04 -0700
Subject: x86: Export srat physical topology

This is the counterpart to "x86: export k8 physical topology" for
SRAT. It is not as invasive because the acpi code already seperates
node setup into detection and registration steps, with the
exception of registering e820 active regions in
acpi_numa_memory_affinity_init().  This is now moved to
acpi_scan_nodes() if NUMA emulation is disabled or deferred.

acpi_numa_init() now returns a value which specifies whether an
underlying SRAT was located.  If so, that topology can be used by
the emulation code to interleave emulated nodes over physical nodes
or to register the nodes for ACPI.

acpi_get_nodes() may now be used to export the srat physical
topology of the machine for NUMA emulation.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251518580.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fda0032..f891419 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -938,11 +938,12 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
-	acpi_numa_init();
+	acpi = acpi_numa_init();
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	k8 = !k8_numa_init(0, max_pfn);
+	if (!acpi)
+		k8 = !k8_numa_init(0, max_pfn);
 #endif
 
 	initmem_init(0, max_pfn, acpi, k8);
-- 
cgit v1.1


From a2e2725541fad72416326798c2d7fa4dafb7d337 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 12 Oct 2009 23:40:10 -0700
Subject: net: Introduce recvmmsg socket syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.

Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.

This takes into account comments made by:

. Paul Moore: sock_recvmsg is called only for the first datagram,
  sock_recvmsg_nosec is used for the rest.

. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
  works in the same fashion as the ppoll one.

  If the underlying protocol returns a datagram with MSG_OOB set, this
  will make recvmmsg return right away with as many datagrams (+ the OOB
  one) it has received so far.

. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
  datagrams and then recvmsg returns an error, recvmmsg will return
  the successfully received datagrams, store the error and return it
  in the next call.

This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg,
where we will be able to acquire the lock only at batch start and end, not at
every underlying recvmsg call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/kernel/syscall_table_32.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd2..70c2125 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
+	.long sys_recvmmsg
-- 
cgit v1.1


From e47938b1faaf9e9041ae842a878901001ce20ea1 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 14 Oct 2009 09:16:30 -0500
Subject: x86: UV RTC: Fix early expiry handling

Tune/fix early timer expiry handling and return correct early timeout value
for set_next_event.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20091014141630.GB11048@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_time.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d..ec14889 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
 	/* Initialize comparator value */
 	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
 
-	return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode));
+	if (uv_read_rtc(NULL) <= expires)
+		return 0;
+
+	return !uv_intr_pending(pnode);
 }
 
 /*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
 
 	next_cpu = head->next_cpu;
 	*t = expires;
+
 	/* Will this one be next to go off? */
 	if (next_cpu < 0 || bcpu == next_cpu ||
 			expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
 			*t = ULLONG_MAX;
 			uv_rtc_find_next_timer(head, pnode);
 			spin_unlock_irqrestore(&head->lock, flags);
-			return 1;
+			return -ETIME;
 		}
 	}
 
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
  *
  * Returns 1 if this timer was pending.
  */
-static int uv_rtc_unset_timer(int cpu)
+static int uv_rtc_unset_timer(int cpu, int force)
 {
 	int pnode = uv_cpu_to_pnode(cpu);
 	int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
 
 	spin_lock_irqsave(&head->lock, flags);
 
-	if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t)
+	if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
 		rc = 1;
 
-	*t = ULLONG_MAX;
-
-	/* Was the hardware setup for this timer? */
-	if (head->next_cpu == bcpu)
-		uv_rtc_find_next_timer(head, pnode);
+	if (rc) {
+		*t = ULLONG_MAX;
+		/* Was the hardware setup for this timer? */
+		if (head->next_cpu == bcpu)
+			uv_rtc_find_next_timer(head, pnode);
+	}
 
 	spin_unlock_irqrestore(&head->lock, flags);
 
@@ -310,20 +315,20 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
 		break;
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
-		uv_rtc_unset_timer(ced_cpu);
+		uv_rtc_unset_timer(ced_cpu, 1);
 		break;
 	}
 }
 
 static void uv_rtc_interrupt(void)
 {
-	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
 	int cpu = smp_processor_id();
+	struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
 
 	if (!ced || !ced->event_handler)
 		return;
 
-	if (uv_rtc_unset_timer(cpu) != 1)
+	if (uv_rtc_unset_timer(cpu, 0) != 1)
 		return;
 
 	ced->event_handler(ced);
-- 
cgit v1.1


From 8c28de4d011f37b2893ecfcec9a985c0e9bd786f Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 14 Oct 2009 09:18:48 -0500
Subject: x86: UV RTC: Add clocksource only boot option

Add clocksource only boot option for UV RTC.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20091014141848.GC11048@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_time.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index ec14889..c6324ad 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -75,6 +75,7 @@ struct uv_rtc_timer_head {
 static struct uv_rtc_timer_head		**blade_info __read_mostly;
 
 static int				uv_rtc_enable;
+static int				uv_rtc_evt_enable;
 
 /*
  * Hardware interface routines
@@ -342,6 +343,14 @@ static int __init uv_enable_rtc(char *str)
 }
 __setup("uvrtc", uv_enable_rtc);
 
+static int __init uv_enable_evt_rtc(char *str)
+{
+	uv_rtc_evt_enable = 1;
+
+	return 1;
+}
+__setup("uvrtcevt", uv_enable_evt_rtc);
+
 static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
 {
 	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
@@ -358,16 +367,20 @@ static __init int uv_rtc_setup_clock(void)
 	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
 		return -ENODEV;
 
-	generic_interrupt_extension = uv_rtc_interrupt;
-
 	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
 				clocksource_uv.shift);
 
 	rc = clocksource_register(&clocksource_uv);
-	if (rc) {
-		generic_interrupt_extension = NULL;
+	if (rc)
+		printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
+	else
+		printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
+			sn_rtc_cycles_per_second/(unsigned long)1E6);
+
+	if (rc || !uv_rtc_evt_enable)
 		return rc;
-	}
+
+	generic_interrupt_extension = uv_rtc_interrupt;
 
 	/* Setup and register clockevents */
 	rc = uv_rtc_allocate_timers();
-- 
cgit v1.1


From d5991ff297ad2f7e2698eefcd8269df5ecec150f Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 14 Oct 2009 09:21:03 -0500
Subject: x86: UV RTC: Clean up error handling

Cleanup error handling in uv_rtc_setup_clock.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20091014142103.GD11048@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_time.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index c6324ad..2556450 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -380,15 +380,12 @@ static __init int uv_rtc_setup_clock(void)
 	if (rc || !uv_rtc_evt_enable)
 		return rc;
 
-	generic_interrupt_extension = uv_rtc_interrupt;
-
 	/* Setup and register clockevents */
 	rc = uv_rtc_allocate_timers();
-	if (rc) {
-		clocksource_unregister(&clocksource_uv);
-		generic_interrupt_extension = NULL;
-		return rc;
-	}
+	if (rc)
+		goto error;
+
+	generic_interrupt_extension = uv_rtc_interrupt;
 
 	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
 				NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -401,11 +398,19 @@ static __init int uv_rtc_setup_clock(void)
 
 	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
 	if (rc) {
-		clocksource_unregister(&clocksource_uv);
 		generic_interrupt_extension = NULL;
 		uv_rtc_deallocate_timers();
+		goto error;
 	}
 
+	printk(KERN_INFO "UV RTC clockevents registered\n");
+
+	return 0;
+
+error:
+	clocksource_unregister(&clocksource_uv);
+	printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
+
 	return rc;
 }
 arch_initcall(uv_rtc_setup_clock);
-- 
cgit v1.1


From 4a4de9c7d7111ce4caf422b856756125d8304f9d Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Wed, 14 Oct 2009 09:22:57 -0500
Subject: x86: UV RTC: Rename generic_interrupt to x86_platform_ipi

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20091014142257.GE11048@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S |  4 ++--
 arch/x86/kernel/irq.c      | 20 ++++++++++----------
 arch/x86/kernel/irqinit.c  |  4 ++--
 arch/x86/kernel/uv_time.c  | 10 +++++-----
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f..6714432 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -969,8 +969,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt GENERIC_INTERRUPT_VECTOR \
-	generic_interrupt smp_generic_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR \
+	x86_platform_ipi smp_x86_platform_ipi
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3912061..9375dce 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
 atomic_t irq_err_count;
 
 /* Function pointer for generic interrupt vector handling */
-void (*generic_interrupt_extension)(void) = NULL;
+void (*x86_platform_ipi_callback)(void) = NULL;
 
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
 	seq_printf(p, "  Performance pending work\n");
 #endif
-	if (generic_interrupt_extension) {
+	if (x86_platform_ipi_callback) {
 		seq_printf(p, "%*s: ", prec, "PLT");
 		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+			seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
 		seq_printf(p, "  Platform interrupts\n");
 	}
 #ifdef CONFIG_SMP
@@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_perf_irqs;
 	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
-	if (generic_interrupt_extension)
-		sum += irq_stats(cpu)->generic_irqs;
+	if (x86_platform_ipi_callback)
+		sum += irq_stats(cpu)->x86_platform_ipis;
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -252,9 +252,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 }
 
 /*
- * Handler for GENERIC_INTERRUPT_VECTOR.
+ * Handler for X86_PLATFORM_IPI_VECTOR.
  */
-void smp_generic_interrupt(struct pt_regs *regs)
+void smp_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -264,10 +264,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
 
 	irq_enter();
 
-	inc_irq_stat(generic_irqs);
+	inc_irq_stat(x86_platform_ipis);
 
-	if (generic_interrupt_extension)
-		generic_interrupt_extension();
+	if (x86_platform_ipi_callback)
+		x86_platform_ipi_callback();
 
 	run_local_timers();
 	irq_exit();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f3077..d593222 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
-	/* generic IPI for platform specific use */
-	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+	/* IPI for X86 platform specific use */
+	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
 
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2556450..3da7b1d 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -91,7 +91,7 @@ static void uv_rtc_send_IPI(int cpu)
 	pnode = uv_apicid_to_pnode(apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
-	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+	      (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
 
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
@@ -116,7 +116,7 @@ static int uv_setup_intr(int cpu, u64 expires)
 	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
 		UVH_EVENT_OCCURRED0_RTC1_MASK);
 
-	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+	val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
 		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
 
 	/* Set configuration */
@@ -364,7 +364,7 @@ static __init int uv_rtc_setup_clock(void)
 {
 	int rc;
 
-	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+	if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback)
 		return -ENODEV;
 
 	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
@@ -385,7 +385,7 @@ static __init int uv_rtc_setup_clock(void)
 	if (rc)
 		goto error;
 
-	generic_interrupt_extension = uv_rtc_interrupt;
+	x86_platform_ipi_callback = uv_rtc_interrupt;
 
 	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
 				NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -398,7 +398,7 @@ static __init int uv_rtc_setup_clock(void)
 
 	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
 	if (rc) {
-		generic_interrupt_extension = NULL;
+		x86_platform_ipi_callback = NULL;
 		uv_rtc_deallocate_timers();
 		goto error;
 	}
-- 
cgit v1.1


From b9af7c0d44b8bb71e3af5e94688d076414aa8c87 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 14 Oct 2009 14:46:55 -0700
Subject: x86-64: preserve large page mapping for 1st 2MB kernel txt with
 CONFIG_DEBUG_RODATA

In the first 2MB, kernel text is co-located with kernel static
page tables setup by head_64.S.  CONFIG_DEBUG_RODATA chops this
2MB large page mapping to small 4KB pages as we mark the kernel text as RO,
leaving the static page tables as RW.

With CONFIG_DEBUG_RODATA disabled, OLTP run on NHM-EP shows 1% improvement
with 2% reduction in system time and 1% improvement in iowait idle time.

To recover this, move the kernel static page tables to .data section, so that
we don't have to break the first 2MB of kernel text to small pages with
CONFIG_DEBUG_RODATA.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20091014220254.063193621@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd92..b55ee4f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
 	.quad	x86_64_start_kernel
 	ENTRY(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
-	__FINITDATA
 
 	ENTRY(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
+	__FINITDATA
 
 bad_address:
 	jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
 	i = i + 1 ;					\
 	.endr
 
+	.data
 	/*
 	 * This default setting generates an ident mapping at address 0x100000
 	 * and a mapping for the kernel that precisely maps virtual address
-- 
cgit v1.1


From 74e081797bd9d2a7d8005fe519e719df343a2ba8 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 14 Oct 2009 14:46:56 -0700
Subject: x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA

CONFIG_DEBUG_RODATA chops the large pages spanning boundaries of kernel
text/rodata/data to small 4KB pages as they are mapped with different
attributes (text as RO, RODATA as RO and NX etc).

On x86_64, preserve the large page mappings for kernel text/rodata/data
boundaries when CONFIG_DEBUG_RODATA is enabled. This is done by allowing the
RODATA section to be hugepage aligned and having same RWX attributes
for the 2MB page boundaries

Extra Memory pages padding the sections will be freed during the end of the boot
and the kernel identity mappings will have different RWX permissions compared to
the kernel text mappings.

Kernel identity mappings to these physical pages will be mapped with smaller
pages but large page mappings are still retained for kernel text,rodata,data
mappings.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20091014220254.190119924@sbs-t61.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vmlinux.lds.S | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 92929fb..1476379 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,21 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+
+#define X64_ALIGN_DEBUG_RODATA_BEGIN	. = ALIGN(HPAGE_SIZE);
+
+#define X64_ALIGN_DEBUG_RODATA_END				\
+		. = ALIGN(HPAGE_SIZE);				\
+		__end_rodata_hpage_align = .;
+
+#else
+
+#define X64_ALIGN_DEBUG_RODATA_BEGIN
+#define X64_ALIGN_DEBUG_RODATA_END
+
+#endif
+
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
 	data PT_LOAD FLAGS(7);          /* RWE */
@@ -90,7 +105,9 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
+	X64_ALIGN_DEBUG_RODATA_END
 
 	/* Data */
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-- 
cgit v1.1


From d6cc1c3af760c1d3f6b42f6e52b08718a6207cf1 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 19 Oct 2009 06:12:04 -0700
Subject: x86-64: add comment for RODATA large page retainment

Add a comment explaining why RODATA is aligned to 2 MB.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vmlinux.lds.S | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1476379..fd2dabe 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -42,7 +42,18 @@ jiffies_64 = jiffies;
 #endif
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
-
+/*
+ * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
+ * we retain large page mappings for boundaries spanning kernel text, rodata
+ * and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
+ * as well as retain 2MB large page mappings for kernel text.
+ */
 #define X64_ALIGN_DEBUG_RODATA_BEGIN	. = ALIGN(HPAGE_SIZE);
 
 #define X64_ALIGN_DEBUG_RODATA_END				\
-- 
cgit v1.1


From 55ca3cc1746335bb6ef1d3894ddb6d0c729b3518 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 28 Oct 2009 18:46:57 -0800
Subject: x86_64, ftrace: Make ftrace use kernel identity mapping to modify
 code

On x86_64, kernel text mappings are mapped read-only with
CONFIG_DEBUG_RODATA. So use the kernel identity mapping instead
of the kernel text mapping to modify the kernel text.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20091029024821.080941108@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527..944e982 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -187,9 +187,26 @@ static void wait_for_nmi(void)
 	nmi_wait_count++;
 }
 
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
 static int
 do_ftrace_mod_code(unsigned long ip, void *new_code)
 {
+	/*
+	 * On x86_64, kernel text mappings are mapped read-only with
+	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+	 * of the kernel text mapping to modify the kernel text.
+	 *
+	 * For 32bit kernels, these mappings are same and we can use
+	 * kernel identity mapping to modify code.
+	 */
+	if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+		ip = (unsigned long)__va(__pa(ip));
+
 	mod_code_ip = (void *)ip;
 	mod_code_newcode = new_code;
 
-- 
cgit v1.1


From 6e18da75c28b592594fd632cf3e6eb09d3d078de Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Thu, 29 Oct 2009 14:47:42 +0100
Subject: x86, amd-ucode: Remove needless log messages

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20091029134742.GD30802@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index f4c538b..c043534 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -109,12 +109,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 		return 0;
 	}
 
-	if (mc_header->processor_rev_id != equiv_cpu_id) {
-		printk(KERN_ERR	"microcode: CPU%d: patch mismatch "
-		       "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
-		       cpu, mc_header->processor_rev_id, equiv_cpu_id);
+	if (mc_header->processor_rev_id != equiv_cpu_id)
 		return 0;
-	}
 
 	/* ucode might be chipset specific -- currently we don't support this */
 	if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
@@ -185,9 +181,6 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 
 	total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
 
-	printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
-	       size, total_size);
-
 	if (total_size > size || total_size > UCODE_MAX_SIZE) {
 		printk(KERN_ERR "microcode: error: size mismatch\n");
 		return NULL;
-- 
cgit v1.1


From d1c84f79a6ba992dc01e312c44a21496303874d6 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 10 Nov 2009 12:07:23 +0100
Subject: x86: ucode-amd: Load ucode-patches once and not separately of each
 CPU

This also implies that corresponding log messages, e.g.

  platform microcode: firmware: requesting amd-ucode/microcode_amd.bin

show up only once on module load and not when ucode is updated
for each CPU.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: dimm <dmitry.adamushko@gmail.com>
LKML-Reference: <20091110110723.GH30802@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c  | 24 +++++++++++++++++-------
 arch/x86/kernel/microcode_core.c |  6 ++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c043534..75538f6 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -33,6 +33,8 @@ MODULE_LICENSE("GPL v2");
 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
 #define UCODE_UCODE_TYPE           0x00000001
 
+const struct firmware *firmware;
+
 struct equiv_cpu_entry {
 	u32	installed_cpu;
 	u32	fixed_errata_mask;
@@ -301,14 +303,10 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
-	const char *fw_name = "amd-ucode/microcode_amd.bin";
-	const struct firmware *firmware;
 	enum ucode_state ret;
 
-	if (request_firmware(&firmware, fw_name, device)) {
-		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
+	if (firmware == NULL)
 		return UCODE_NFOUND;
-	}
 
 	if (*(u32 *)firmware->data != UCODE_MAGIC) {
 		printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n",
@@ -318,8 +316,6 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 
 	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
 
-	release_firmware(firmware);
-
 	return ret;
 }
 
@@ -339,7 +335,21 @@ static void microcode_fini_cpu_amd(int cpu)
 	uci->mc = NULL;
 }
 
+void init_microcode_amd(struct device *device)
+{
+	const char *fw_name = "amd-ucode/microcode_amd.bin";
+	if (request_firmware(&firmware, fw_name, device))
+		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
+}
+
+void fini_microcode_amd(void)
+{
+	release_firmware(firmware);
+}
+
 static struct microcode_ops microcode_amd_ops = {
+	.init				  = init_microcode_amd,
+	.fini				  = fini_microcode_amd,
 	.request_microcode_user           = request_microcode_user,
 	.request_microcode_fw             = request_microcode_fw,
 	.collect_cpu_info                 = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8..d2a8160 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -520,6 +520,9 @@ static int __init microcode_init(void)
 		return PTR_ERR(microcode_pdev);
 	}
 
+	if (microcode_ops->init)
+		microcode_ops->init(&microcode_pdev->dev);
+
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
@@ -563,6 +566,9 @@ static void __exit microcode_exit(void)
 
 	platform_device_unregister(microcode_pdev);
 
+	if (microcode_ops->fini)
+		microcode_ops->fini();
+
 	microcode_ops = NULL;
 
 	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-- 
cgit v1.1


From 14c569425a0ae12cbeed72fdb8ebe78c48455dfd Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 10 Nov 2009 12:08:25 +0100
Subject: x86: ucode-amd: Don't warn when no ucode is available for a CPU
 revision

There is no point in warning when there is no ucode available
for a specific CPU revision. Currently the container-file, which
provides the AMD ucode patches for OS load, contains only a few
ucode patches.

It's already clearly indicated by the printed patch_level
whenever new ucode was available and an update happened. So the
warning message is of no help but rather annoying on systems
with many CPUs.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: dimm <dmitry.adamushko@gmail.com>
LKML-Reference: <20091110110825.GI30802@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 75538f6..9f13324 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -105,11 +105,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 		i++;
 	}
 
-	if (!equiv_cpu_id) {
-		printk(KERN_WARNING "microcode: CPU%d: cpu revision "
-		       "not listed in equivalent cpu table\n", cpu);
+	if (!equiv_cpu_id)
 		return 0;
-	}
 
 	if (mc_header->processor_rev_id != equiv_cpu_id)
 		return 0;
-- 
cgit v1.1


From 1a74357066369be91e6f4f431621a00b052df964 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 10 Nov 2009 12:09:20 +0100
Subject: x86: ucode-amd: Convert printk(KERN_*...) to pr_*(...)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: dimm <dmitry.adamushko@gmail.com>
LKML-Reference: <20091110110920.GJ30802@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 9f13324..26e33bd 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -78,12 +78,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 
 	memset(csig, 0, sizeof(*csig));
 	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-		printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
-		       "supported\n", cpu, c->x86);
+		pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
+			   "supported\n", cpu, c->x86);
 		return -1;
 	}
 	rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
-	printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
+	pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
 	return 0;
 }
 
@@ -113,7 +113,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 
 	/* ucode might be chipset specific -- currently we don't support this */
 	if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
-		printk(KERN_ERR "microcode: CPU%d: loading of chipset "
+		pr_err(KERN_ERR "microcode: CPU%d: loading of chipset "
 		       "specific code not yet supported\n", cpu);
 		return 0;
 	}
@@ -143,14 +143,12 @@ static int apply_microcode_amd(int cpu)
 
 	/* check current patch id and patch's id for match */
 	if (rev != mc_amd->hdr.patch_id) {
-		printk(KERN_ERR "microcode: CPU%d: update failed "
+		pr_err("microcode: CPU%d: update failed "
 		       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
 		return -1;
 	}
 
-	printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
-	       cpu, rev);
-
+	pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
 	uci->cpu_sig.rev = rev;
 
 	return 0;
@@ -173,7 +171,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 		return NULL;
 
 	if (section_hdr[0] != UCODE_UCODE_TYPE) {
-		printk(KERN_ERR "microcode: error: invalid type field in "
+		pr_err("microcode: error: invalid type field in "
 		       "container file section header\n");
 		return NULL;
 	}
@@ -181,7 +179,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
 	total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
 
 	if (total_size > size || total_size > UCODE_MAX_SIZE) {
-		printk(KERN_ERR "microcode: error: size mismatch\n");
+		pr_err("microcode: error: size mismatch\n");
 		return NULL;
 	}
 
@@ -210,15 +208,14 @@ static int install_equiv_cpu_table(const u8 *buf)
 	size = buf_pos[2];
 
 	if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
-		printk(KERN_ERR "microcode: error: invalid type field in "
+		pr_err("microcode: error: invalid type field in "
 		       "container file section header\n");
 		return 0;
 	}
 
 	equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
 	if (!equiv_cpu_table) {
-		printk(KERN_ERR "microcode: failed to allocate "
-		       "equivalent CPU table\n");
+		pr_err("microcode: failed to allocate equivalent CPU table\n");
 		return 0;
 	}
 
@@ -251,8 +248,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 	offset = install_equiv_cpu_table(ucode_ptr);
 	if (!offset) {
-		printk(KERN_ERR "microcode: failed to create "
-		       "equivalent cpu table\n");
+		pr_err("microcode: failed to create equivalent cpu table\n");
 		return UCODE_ERROR;
 	}
 
@@ -306,7 +302,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 		return UCODE_NFOUND;
 
 	if (*(u32 *)firmware->data != UCODE_MAGIC) {
-		printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n",
+		pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n",
 		       *(u32 *)firmware->data);
 		return UCODE_ERROR;
 	}
@@ -319,8 +315,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 static enum ucode_state
 request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-	printk(KERN_INFO "microcode: AMD microcode update via "
-	       "/dev/cpu/microcode not supported\n");
+	pr_info("microcode: AMD microcode update via "
+		"/dev/cpu/microcode not supported\n");
 	return UCODE_ERROR;
 }
 
@@ -336,7 +332,7 @@ void init_microcode_amd(struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
 	if (request_firmware(&firmware, fw_name, device))
-		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
+		pr_err("microcode: failed to load file %s\n", fw_name);
 }
 
 void fini_microcode_amd(void)
-- 
cgit v1.1


From 9f15226e75583547aaf542c6be4bdac1060dd425 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Wed, 11 Nov 2009 20:03:29 +0100
Subject: x86, ucode-amd: Ensure ucode update on suspend/resume after CPU
 off/online cycle

When switching a CPU offline/online and then doing
suspend/resume, ucode is not updated on this CPU.

This is due to the microcode_fini_cpu() call which frees uci->mc
when setting the CPU offline:

  static void microcode_fini_cpu_amd(int cpu)
  {
          struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

          vfree(uci->mc);
          uci->mc = NULL;
  }

When the CPU is set online uci->mc is still NULL because no
ucode update is required.

Finally this prevents ucode update when resuming after suspend:

  static enum ucode_state microcode_resume_cpu(int cpu)
  {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

        if (!uci->mc)
                return UCODE_NFOUND;

        ...
  }

Fix is to check whether uci->mc is valid before
microcode_resume_cpu() is called.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: dimm <dmitry.adamushko@gmail.com>
LKML-Reference: <20091111190329.GF18592@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index d2a8160..adf2340 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -393,7 +393,7 @@ static enum ucode_state microcode_update_cpu(int cpu)
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	enum ucode_state ustate;
 
-	if (uci->valid)
+	if (uci->valid && uci->mc)
 		ustate = microcode_resume_cpu(cpu);
 	else
 		ustate = microcode_init_cpu(cpu);
-- 
cgit v1.1


From 196cf0d67acad70ebb2572da489d5cc7066cdd05 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 10 Nov 2009 18:27:23 -0800
Subject: x86: Make sure wakeup trampoline code is below 1MB

Instead of using bootmem, try find_e820_area()/reserve_early(),
and call acpi_reserve_memory() early, to allocate the wakeup
trampoline code area below 1M.

This is more reliable, and it also removes a dependency on
bootmem.

-v2: change function name to acpi_reserve_wakeup_memory(),
     as suggested by Rafael.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: pm list <linux-pm@lists.linux-foundation.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4AFA210B.3020207@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/sleep.c | 15 +++++++++------
 arch/x86/kernel/setup.c      | 13 +++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638..4a41145 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -119,29 +119,32 @@ void acpi_restore_state_mem(void)
 
 
 /**
- * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
  *
  * We allocate a page from the first 1MB of memory for the wakeup
  * routine for when we come back from a sleep state. The
  * runtime allocator allows specification of <16MB pages, but not
  * <1MB pages.
  */
-void __init acpi_reserve_bootmem(void)
+void __init acpi_reserve_wakeup_memory(void)
 {
+	unsigned long mem;
+
 	if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
 		printk(KERN_ERR
 		       "ACPI: Wakeup code way too big, S3 disabled.\n");
 		return;
 	}
 
-	acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+	mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
 
-	if (!acpi_realmode) {
+	if (mem == -1L) {
 		printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
 		return;
 	}
-
-	acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
+	acpi_realmode = (unsigned long) phys_to_virt(mem);
+	acpi_wakeup_address = mem;
+	reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
 }
 
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f891419..0a6e94a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -897,6 +897,13 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_brk();
 
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 * even before init_memory_mapping
+	 */
+	acpi_reserve_wakeup_memory();
+#endif
 	init_gbpages();
 
 	/* max_pfn_mapped is updated here */
@@ -948,12 +955,6 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init(0, max_pfn, acpi, k8);
 
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
 	/*
 	 * Find and reserve possible boot-time SMP configuration:
 	 */
-- 
cgit v1.1


From 24a065624dcdd91e8bfd0f14113feb91c7ed11ca Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 05:33:18 -0700
Subject: sysctl x86: Remove dead binary sysctl support

Now that sys_sysctl is a generic wrapper around /proc/sys  .ctl_name
and .strategy members of sysctl tables are dead code.  Remove them.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/kernel/vsyscall_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974..e02d92d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -237,7 +237,7 @@ static ctl_table kernel_table2[] = {
 };
 
 static ctl_table kernel_root_table2[] = {
-	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
+	{ .procname = "kernel", .mode = 0555,
 	  .child = kernel_table2 },
 	{}
 };
-- 
cgit v1.1


From 411462f62a65eeae7f451c6eb7a38b9d8759c61a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Nov 2009 11:52:39 +0100
Subject: x86: Fix printk format due to variable type change

clockevents.mult became u32. Fix the printk format.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97..cf4ee51 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -662,7 +662,7 @@ static int __init calibrate_APIC_clock(void)
 	calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
 
 	apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
-	apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult);
+	apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
 	apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
 		    calibration_result);
 
-- 
cgit v1.1


From 8a50e5135af0c243e117e94e27feb8d149c879b4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 13 Nov 2009 15:28:13 -0800
Subject: x86-32: Use symbolic constants, safer CPUID when enabling EFER.NX

Use symbolic constants rather than hard-coded values when setting
EFER.NX in head_32.S, and do a more rigorous test for the validity of
the response when probing for the extended CPUID range.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <1258154897-6770-2-git-send-email-hpa@zytor.com>
Acked-by: Kees Cook <kees.cook@canonical.com>
---
 arch/x86/kernel/head_32.S | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278..7fd318b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/setup.h>
 #include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeature.h>
 #include <asm/percpu.h>
 
 /* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
 	orl %edx,%eax
 	movl %eax,%cr4
 
-	btl $5, %eax		# check if PAE is enabled
-	jnc 6f
+	testb $X86_CR4_PAE, %al		# check if PAE is enabled
+	jz 6f
 
 	/* Check if extended functions are implemented */
 	movl $0x80000000, %eax
 	cpuid
-	cmpl $0x80000000, %eax
-	jbe 6f
+	/* Value must be in the range 0x80000001 to 0x8000ffff */
+	subl $0x80000001, %eax
+	cmpl $(0x8000ffff-0x80000001), %eax
+	ja 6f
 	mov $0x80000001, %eax
 	cpuid
 	/* Execute Disable bit supported? */
-	btl $20, %edx
+	btl $(X86_FEATURE_NX & 31), %edx
 	jnc 6f
 
 	/* Setup EFER (Extended Feature Enable Register) */
-	movl $0xc0000080, %ecx
+	movl $MSR_EFER, %ecx
 	rdmsr
 
-	btsl $11, %eax
+	btsl $_EFER_NX, %eax
 	/* Make changes effective */
 	wrmsr
 
-- 
cgit v1.1


From a7c4c0d934c6cbc58de262d090d4a715445453f0 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 13 Nov 2009 15:28:14 -0800
Subject: x86, sleep: Always save the value of EFER

Always save the value of EFER, regardless of the state of NX.  Since
EFER may not actually exist, use rdmsr_safe() to do so.

v2: check the return value from rdmsr_safe() instead of relying on
    the output values being unchanged on error.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@tuxonice.net>
LKML-Reference: <1258154897-6770-3-git-send-email-hpa@zytor.com>
Acked-by: Kees Cook <kees.cook@canonical.com>
---
 arch/x86/kernel/acpi/sleep.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 4a41145..82e5086 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
 #ifndef CONFIG_64BIT
 	store_gdt((struct desc_ptr *)&header->pmode_gdt);
 
-	header->pmode_efer_low = nx_enabled;
-	if (header->pmode_efer_low & 1) {
-		/* This is strange, why not save efer, always? */
-		rdmsr(MSR_EFER, header->pmode_efer_low,
-			header->pmode_efer_high);
-	}
+	if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
+		       &header->pmode_efer_high))
+		header->pmode_efer_low = header->pmode_efer_high = 0;
 #endif /* !CONFIG_64BIT */
 
 	header->pmode_cr0 = read_cr0();
-- 
cgit v1.1


From 583140afb989f24d115e80be5c91e503b58ccfc0 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 13 Nov 2009 15:28:15 -0800
Subject: x86, pageattr: Make set_memory_(x|nx) aware of NX support

Make set_memory_x/set_memory_nx directly aware of if NX is supported
in the system or not, rather than requiring that every caller assesses
that support independently.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tim Starling <tstarling@wikimedia.org>
Cc: Hannes Eder <hannes@hanneseder.net>
LKML-Reference: <1258154897-6770-4-git-send-email-hpa@zytor.com>
Acked-by: Kees Cook <kees.cook@canonical.com>
---
 arch/x86/kernel/machine_kexec_32.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d..03657e7 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -157,8 +157,7 @@ int machine_kexec_prepare(struct kimage *image)
 {
 	int error;
 
-	if (nx_enabled)
-		set_pages_x(image->control_code_page, 1);
+	set_pages_x(image->control_code_page, 1);
 	error = machine_kexec_alloc_page_tables(image);
 	if (error)
 		return error;
@@ -172,8 +171,7 @@ int machine_kexec_prepare(struct kimage *image)
  */
 void machine_kexec_cleanup(struct kimage *image)
 {
-	if (nx_enabled)
-		set_pages_nx(image->control_code_page, 1);
+	set_pages_nx(image->control_code_page, 1);
 	machine_kexec_free_page_tables(image);
 }
 
-- 
cgit v1.1


From 4763ed4d45522b876c97e1f7f4b659d211f75571 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 13 Nov 2009 15:28:16 -0800
Subject: x86, mm: Clean up and simplify NX enablement

The 32- and 64-bit code used very different mechanisms for enabling
NX, but even the 32-bit code was enabling NX in head_32.S if it is
available.  Furthermore, we had a bewildering collection of tests for
the available of NX.

This patch:

a) merges the 32-bit set_nx() and the 64-bit check_efer() function
   into a single x86_configure_nx() function.  EFER control is left
   to the head code.

b) eliminates the nx_enabled variable entirely.  Things that need to
   test for NX enablement can verify __supported_pte_mask directly,
   and cpu_has_nx gives the supported status of NX.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Chris Wright <chrisw@sous-sol.org>
LKML-Reference: <1258154897-6770-5-git-send-email-hpa@zytor.com>
Acked-by: Kees Cook <kees.cook@canonical.com>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 arch/x86/kernel/setup.c      | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b..18346da 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void)
 	wrmsrl(MSR_KERNEL_GS_BASE, 0);
 	barrier();
 
-	check_efer();
+	x86_configure_nx();
 	if (cpu != 0)
 		enable_x2apic();
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a6e94a..23b7f46 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -787,21 +787,17 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
-#ifdef CONFIG_X86_64
 	/*
 	 * Must call this twice: Once just to detect whether hardware doesn't
 	 * support NX (so that the early EHCI debug console setup can safely
 	 * call set_fixmap(), and then again after parsing early parameters to
 	 * honor the respective command line option.
 	 */
-	check_efer();
-#endif
+	x86_configure_nx();
 
 	parse_early_param();
 
-#ifdef CONFIG_X86_64
-	check_efer();
-#endif
+	x86_configure_nx();
 
 	/* Must be before kernel pagetables are setup */
 	vmi_activate();
-- 
cgit v1.1


From 4b0f3b81eb33ef18283aa71440cccfede1753ae0 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Fri, 13 Nov 2009 15:28:17 -0800
Subject: x86, mm: Report state of NX protections during boot

It is possible for x86_64 systems to lack the NX bit either due to the
hardware lacking support or the BIOS having turned off the CPU capability,
so NX status should be reported.  Additionally, anyone booting NX-capable
CPUs in 32bit mode without PAE will lack NX functionality, so this change
provides feedback for that case as well.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <1258154897-6770-6-git-send-email-hpa@zytor.com>
---
 arch/x86/kernel/setup.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 23b7f46..d2043a0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -788,16 +788,17 @@ void __init setup_arch(char **cmdline_p)
 	*cmdline_p = command_line;
 
 	/*
-	 * Must call this twice: Once just to detect whether hardware doesn't
-	 * support NX (so that the early EHCI debug console setup can safely
-	 * call set_fixmap(), and then again after parsing early parameters to
-	 * honor the respective command line option.
+	 * x86_configure_nx() is called before parse_early_param() to detect
+	 * whether hardware doesn't support NX (so that the early EHCI debug
+	 * console setup can safely call set_fixmap()). It may then be called
+	 * again from within noexec_setup() during parsing early parameters
+	 * to honor the respective command line option.
 	 */
 	x86_configure_nx();
 
 	parse_early_param();
 
-	x86_configure_nx();
+	x86_report_nx();
 
 	/* Must be before kernel pagetables are setup */
 	vmi_activate();
-- 
cgit v1.1


From 8cc2361bd00e87aab2827a3996a71fe9b2c9f9c4 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 17 Nov 2009 08:06:38 +0100
Subject: x86: ucode-amd: Move family check to microcde_amd.c's init function

... to avoid useless trial to load firmware on systems with
unsupported AMD CPUs.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Andreas Mohr <andi@lisas.de>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20091117070638.GA27691@alberich.amd.com>
[ v2: changed BUG_ON() to WARN_ON() ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 26e33bd..63123d9 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -34,6 +34,7 @@ MODULE_LICENSE("GPL v2");
 #define UCODE_UCODE_TYPE           0x00000001
 
 const struct firmware *firmware;
+static int supported_cpu;
 
 struct equiv_cpu_entry {
 	u32	installed_cpu;
@@ -73,15 +74,12 @@ static struct equiv_cpu_entry *equiv_cpu_table;
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 {
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	u32 dummy;
 
-	memset(csig, 0, sizeof(*csig));
-	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
-		pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
-			   "supported\n", cpu, c->x86);
+	if (!supported_cpu)
 		return -1;
-	}
+
+	memset(csig, 0, sizeof(*csig));
 	rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
 	pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
 	return 0;
@@ -331,6 +329,17 @@ static void microcode_fini_cpu_amd(int cpu)
 void init_microcode_amd(struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
+
+	if (c->x86 < 0x10) {
+		pr_warning("microcode: AMD CPU family 0x%x not supported\n",
+			   c->x86);
+		return;
+	}
+	supported_cpu = 1;
+
 	if (request_firmware(&firmware, fw_name, device))
 		pr_err("microcode: failed to load file %s\n", fw_name);
 }
-- 
cgit v1.1


From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Tue, 17 Nov 2009 13:49:50 +0800
Subject: timekeeping: Fix clock_gettime vsyscall time warp

Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier
to struct timekeeper" the clock multiplier of vsyscall is updated with
the unmodified clock multiplier of the clock source and not with the
NTP adjusted multiplier of the timekeeper.

This causes user space observerable time warps:
new CLOCK-warp maximum: 120 nsecs,  00000025c337c537 -> 00000025c337c4bf

Add a new argument "mult" to update_vsyscall() and hand in the
timekeeping internal NTP adjusted multiplier.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Tony Luck <tony.luck@intel.com>
LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vsyscall_64.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974..62f39d7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	unsigned long flags;
 
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 	vsyscall_gtod_data.clock.vread = clock->vread;
 	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
 	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = clock->mult;
+	vsyscall_gtod_data.clock.mult = mult;
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-- 
cgit v1.1


From 508d85c2c6bc8cba53d2a54d9a306ad64a0a80bf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 16 Nov 2009 23:04:56 -0800
Subject: x86: When cleaning MTRRs, do not fold WP into UC

The current MTRR code treats WP as a form of UC.  This really isn't
desirable behaviour, except possibly in the case of severe MTRR
shortage.  Disable this, to allow legitimate uses of WP to remain
unmolested.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c..6e49f6f 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -689,8 +689,6 @@ static int __init mtrr_need_cleanup(void)
 			continue;
 		if (!size)
 			type = MTRR_NUM_TYPES;
-		if (type == MTRR_TYPE_WRPROT)
-			type = MTRR_TYPE_UNCACHABLE;
 		num[type]++;
 	}
 
-- 
cgit v1.1


From 070e5c3f9989a72076e83fdd5ede3f0f3eb17264 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Nov 2009 12:27:47 +0100
Subject: x86: vmiclock: Fix printk format

clockevents.mult became u32. Fix the printk format.

Pointed-out-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/vmiclock_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2..74c92bb 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
 	evt->min_delta_ns = clockevent_delta2ns(1, evt);
 	evt->cpumask = cpumask_of(cpu);
 
-	printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+	printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
 	       evt->name, evt->mult, evt->shift);
 	clockevents_register_device(evt);
 }
-- 
cgit v1.1


From 350f8f5631922c7848ec4b530c111cb8c2ff7caa Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Fri, 13 Nov 2009 11:54:40 +0000
Subject: x86: Eliminate redundant/contradicting cache line size config options

Rather than having X86_L1_CACHE_BYTES and X86_L1_CACHE_SHIFT
(with inconsistent defaults), just having the latter suffices as
the former can be easily calculated from it.

To be consistent, also change X86_INTERNODE_CACHE_BYTES to
X86_INTERNODE_CACHE_SHIFT, and set it to 7 (128 bytes) for NUMA
to account for last level cache line size (which here matters
more than L1 cache line size).

Finally, make sure the default value for X86_L1_CACHE_SHIFT,
when X86_GENERIC is selected, is being seen before that for the
individual CPU model options (other than on x86-64, where
GENERIC_CPU is part of the choice construct, X86_GENERIC is a
separate option on ix86).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Ravikiran Thirumalai <kiran@scalex86.org>
Acked-by: Nick Piggin <npiggin@suse.de>
LKML-Reference: <4AFD5710020000780001F8F0@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index fd2dabe..eeb4f5f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -135,13 +135,13 @@ SECTIONS
 
 		PAGE_ALIGNED_DATA(PAGE_SIZE)
 
-		CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+		CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
 
 		DATA_DATA
 		CONSTRUCTORS
 
 		/* rarely changed data like cpu maps */
-		READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+		READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
 
 		/* End of data section */
 		_edata = .;
@@ -165,12 +165,12 @@ SECTIONS
 		*(.vsyscall_0)
 	} :user
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
 		*(.vsyscall_fn)
 	}
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
 		*(.vsyscall_gtod_data)
 	}
@@ -194,7 +194,7 @@ SECTIONS
 	}
 	vgetcpu_mode = VVIRT(.vgetcpu_mode);
 
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	. = ALIGN(L1_CACHE_BYTES);
 	.jiffies : AT(VLOAD(.jiffies)) {
 		*(.jiffies)
 	}
-- 
cgit v1.1


From 44280733e71ad15377735b42d8538c109c94d7e3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 22 Nov 2009 17:18:49 -0800
Subject: x86: Change crash kernel to reserve via reserve_early()

use find_e820_area()/reserve_early() instead.

-v2: address Eric's request, to restore original semantics.
     will fail, if the provided address can not be used.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <4B09E2F9.7040403@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 57 +++++++++++++------------------------------------
 1 file changed, 15 insertions(+), 42 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d2043a0..e3eae59 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -487,42 +487,11 @@ static void __init reserve_early_setup_data(void)
 
 #ifdef CONFIG_KEXEC
 
-/**
- * Reserve @size bytes of crashkernel memory at any suitable offset.
- *
- * @size: Size of the crashkernel memory to reserve.
- * Returns the base address on success, and -1ULL on failure.
- */
-static
-unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
-{
-	const unsigned long long alignment = 16<<20; 	/* 16M */
-	unsigned long long start = 0LL;
-
-	while (1) {
-		int ret;
-
-		start = find_e820_area(start, ULONG_MAX, size, alignment);
-		if (start == -1ULL)
-			return start;
-
-		/* try to reserve it */
-		ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
-		if (ret >= 0)
-			return start;
-
-		start += alignment;
-	}
-}
-
 static inline unsigned long long get_total_mem(void)
 {
 	unsigned long long total;
 
-	total = max_low_pfn - min_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	total += highend_pfn - highstart_pfn;
-#endif
+	total = max_pfn - min_low_pfn;
 
 	return total << PAGE_SHIFT;
 }
@@ -542,21 +511,25 @@ static void __init reserve_crashkernel(void)
 
 	/* 0 means: find the address automatically */
 	if (crash_base <= 0) {
-		crash_base = find_and_reserve_crashkernel(crash_size);
+		const unsigned long long alignment = 16<<20;	/* 16M */
+
+		crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
+				 alignment);
 		if (crash_base == -1ULL) {
-			pr_info("crashkernel reservation failed. "
-				"No suitable area found.\n");
+			pr_info("crashkernel reservation failed - No suitable area found.\n");
 			return;
 		}
 	} else {
-		ret = reserve_bootmem_generic(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE);
-		if (ret < 0) {
-			pr_info("crashkernel reservation failed - "
-				"memory is in use\n");
+		unsigned long long start;
+
+		start = find_e820_area(crash_base, ULONG_MAX, crash_size,
+				 1<<20);
+		if (start != crash_base) {
+			pr_info("crashkernel reservation failed - memory is in use.\n");
 			return;
 		}
 	}
+	reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
 
 	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
 			"for crashkernel (System RAM: %ldMB)\n",
@@ -927,6 +900,8 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_initrd();
 
+	reserve_crashkernel();
+
 	vsmp_init();
 
 	io_delay_init();
@@ -957,8 +932,6 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	find_smp_config();
 
-	reserve_crashkernel();
-
 #ifdef CONFIG_X86_64
 	/*
 	 * dma32_reserve_bootmem() allocates bootmem which may conflict
-- 
cgit v1.1


From d9c2d5ac6af87b4491bff107113aaf16f6c2b2d9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 21 Nov 2009 00:23:37 -0800
Subject: x86, numa: Use near(er) online node instead of roundrobin for NUMA

CPU to node mapping is set via the following sequence:

 1. numa_init_array(): Set up roundrobin from cpu to online node

 2. init_cpu_to_node(): Set that according to apicid_to_node[]
			according to srat only handle the node that
			is online, and leave other cpu on node
			without ram (aka not online) to still
			roundrobin.

3. later call srat_detect_node for Intel/AMD, will use first_online
   node or nearby node.

Problem is that setup_per_cpu_areas() is not called between 2 and 3,
the per_cpu for cpu on node with ram is on different node, and could
put that on node with two hops away.

So try to optimize this and add find_near_online_node() and call
init_cpu_to_node().

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4B07A739.3030104@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835..c900b73 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
 	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
+	if (node == NUMA_NO_NODE)
 		node = first_node(node_online_map);
+	else if (!node_online(node)) {
+		/* reuse the value from init_cpu_to_node() */
+		node = cpu_to_node(cpu);
+	}
 	numa_set_node(cpu, node);
 
 	printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
-- 
cgit v1.1


From e38e2af1c57c3eb5211331a5b4fcaae0c4a2a918 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Thu, 19 Nov 2009 17:12:43 -0600
Subject: x86: SGI UV: Fix BAU initialization

A memory mapped register that affects the SGI UV Broadcast
Assist Unit's interrupt handling may sometimes be unintialized.

Remove the condition on its initialization, as that condition
can be randomly satisfied by a hardware reset.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
LKML-Reference: <E1NBGB9-0005nU-Dp@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 503c1f2..af21e55 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -819,10 +819,8 @@ static int __init uv_init_blade(int blade)
 	 */
 	apicid = blade_to_first_apicid(blade);
 	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
-	if ((pa & 0xff) != UV_BAU_MESSAGE) {
-		uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
+	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
 				      ((apicid << 32) | UV_BAU_MESSAGE));
-	}
 	return 0;
 }
 
-- 
cgit v1.1


From 581f202bcd60acbc3af1f5faa429e570c512f8a3 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 20 Nov 2009 15:48:26 -0600
Subject: x86: UV RTC: Always enable RTC clocksource

Always enable the RTC clocksource on UV systems.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
LKML-Reference: <20091120214826.GA20016@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/uv_time.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 3da7b1d..3c84aa0 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,6 @@ struct uv_rtc_timer_head {
  */
 static struct uv_rtc_timer_head		**blade_info __read_mostly;
 
-static int				uv_rtc_enable;
 static int				uv_rtc_evt_enable;
 
 /*
@@ -335,14 +334,6 @@ static void uv_rtc_interrupt(void)
 	ced->event_handler(ced);
 }
 
-static int __init uv_enable_rtc(char *str)
-{
-	uv_rtc_enable = 1;
-
-	return 1;
-}
-__setup("uvrtc", uv_enable_rtc);
-
 static int __init uv_enable_evt_rtc(char *str)
 {
 	uv_rtc_evt_enable = 1;
@@ -364,12 +355,16 @@ static __init int uv_rtc_setup_clock(void)
 {
 	int rc;
 
-	if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback)
+	if (!is_uv_system())
 		return -ENODEV;
 
 	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
 				clocksource_uv.shift);
 
+	/* If single blade, prefer tsc */
+	if (uv_num_possible_blades() == 1)
+		clocksource_uv.rating = 250;
+
 	rc = clocksource_register(&clocksource_uv);
 	if (rc)
 		printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
@@ -377,7 +372,7 @@ static __init int uv_rtc_setup_clock(void)
 		printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
 			sn_rtc_cycles_per_second/(unsigned long)1E6);
 
-	if (rc || !uv_rtc_evt_enable)
+	if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
 		return rc;
 
 	/* Setup and register clockevents */
-- 
cgit v1.1


From fd12a0d69aee6d90fa9b9890db24368a897f8423 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 19 Nov 2009 14:23:41 -0600
Subject: x86: UV SGI: Don't track GRU space in PAT

GRU space is always mapped as WB in the page table. There is
no need to track the mappings in the PAT. This also eliminates
the "freeing invalid memtype" messages when the GRU space is
unmapped.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20091119202341.GA4420@sgi.com>
[ v2: fix build failure ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 19 ++++++++++++++++++-
 arch/x86/kernel/x86_init.c         |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f5f5886..2477c9f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 #include <asm/smp.h>
+#include <asm/x86_init.h>
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
 static enum uv_system_type uv_system_type;
+static u64 gru_start_paddr, gru_end_paddr;
+
+static int is_GRU_range(u64 start, u64 end)
+{
+	return start >= gru_start_paddr && end < gru_end_paddr;
+}
+
+static int uv_is_untracked_pat_range(u64 start, u64 end)
+{
+	return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
 
 static int early_get_nodeid(void)
 {
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
+		x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
 		if (!strcmp(oem_table_id, "UVL"))
 			uv_system_type = UV_LEGACY_APIC;
 		else if (!strcmp(oem_table_id, "UVX"))
@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
 	int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
 
 	gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
-	if (gru.s.enable)
+	if (gru.s.enable) {
 		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
+		gru_start_paddr = ((u64)gru.s.base << shift);
+		gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+
+	}
 }
 
 static __init void map_mmr_high(int max_pnode)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a..bcc749e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,6 +13,7 @@
 #include <asm/e820.h>
 #include <asm/time.h>
 #include <asm/irq.h>
+#include <asm/pat.h>
 #include <asm/tsc.h>
 
 void __cpuinit x86_init_noop(void) { }
@@ -69,6 +70,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
 };
 
 struct x86_platform_ops x86_platform = {
+	.is_untracked_pat_range		= default_is_untracked_pat_range,
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
-- 
cgit v1.1


From eb41c8be89dbe079f49202774e04a79ccac48a09 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 23 Nov 2009 14:46:07 -0800
Subject: x86, platform: Change is_untracked_pat_range() to bool; cleanup init

- Change is_untracked_pat_range() to return bool.
- Clean up the initialization of is_untracked_pat_range() -- by default,
  we simply point it at is_ISA_range() directly.
- Move is_untracked_pat_range to the end of struct x86_platform, since
  it is the newest field.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20091119202341.GA4420@sgi.com>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++--
 arch/x86/kernel/x86_init.c         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2477c9f..597a47b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -37,12 +37,12 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
 static enum uv_system_type uv_system_type;
 static u64 gru_start_paddr, gru_end_paddr;
 
-static int is_GRU_range(u64 start, u64 end)
+static inline bool is_GRU_range(u64 start, u64 end)
 {
 	return start >= gru_start_paddr && end < gru_end_paddr;
 }
 
-static int uv_is_untracked_pat_range(u64 start, u64 end)
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
 {
 	return is_ISA_range(start, end) || is_GRU_range(start, end);
 }
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index bcc749e..861b8b5 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -70,8 +70,8 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
 };
 
 struct x86_platform_ops x86_platform = {
-	.is_untracked_pat_range		= default_is_untracked_pat_range,
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
+	.is_untracked_pat_range		= is_ISA_range,
 };
-- 
cgit v1.1


From b24c2a925a9837cccf54d50aeac22ba0cbc15455 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 24 Nov 2009 02:48:18 -0800
Subject: x86: Move find_smp_config() earlier and avoid bootmem usage

Move the find_smp_config() call to before bootmem is initialized.
Use reserve_early() instead of reserve_bootmem() in it.

This simplifies the code, we only need to call find_smp_config()
once and can remove the now unneeded reserve parameter from
x86_init_mpparse::find_smp_config.

We thus also reduce x86's dependency on bootmem allocations.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4B0BB9F2.70907@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/numaq_32.c |  5 -----
 arch/x86/kernel/mpparse.c       | 44 +++++++++++------------------------------
 arch/x86/kernel/setup.c         | 10 +++++-----
 arch/x86/kernel/visws_quirks.c  |  2 +-
 4 files changed, 18 insertions(+), 43 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2..9c0629c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
 static __init void early_check_numaq(void)
 {
 	/*
-	 * Find possible boot-time SMP configuration:
-	 */
-	early_find_smp_config();
-
-	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef..35a57c9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
 	 */
 }
 
-static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
 {
 	unsigned long size = get_mpc_size(mpf->physptr);
-#ifdef CONFIG_X86_32
-	/*
-	 * We cannot access to MPC table to compute table size yet,
-	 * as only few megabytes from the bottom is mapped now.
-	 * PC-9800's MPC table places on the very last of physical
-	 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
-	 * yields BUG() in reserve_bootmem.
-	 * also need to make sure physptr is below than max_low_pfn
-	 * we don't need reserve the area above max_low_pfn
-	 */
-	unsigned long end = max_low_pfn * PAGE_SIZE;
 
-	if (mpf->physptr < end) {
-		if (mpf->physptr + size > end)
-			size = end - mpf->physptr;
-		reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-	}
-#else
-	reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
-#endif
+	reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
 }
 
-static int __init smp_scan_config(unsigned long base, unsigned long length,
-				  unsigned reserve)
+static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
 	unsigned int *bp = phys_to_virt(base);
 	struct mpf_intel *mpf;
+	unsigned long mem;
 
 	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
 			bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 			printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
 			       mpf, (u64)virt_to_phys(mpf));
 
-			if (!reserve)
-				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
-						BOOTMEM_DEFAULT);
+			mem = virt_to_phys(mpf);
+			reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
 			if (mpf->physptr)
-				smp_reserve_bootmem(mpf);
+				smp_reserve_memory(mpf);
 
 			return 1;
 		}
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 }
 
-void __init default_find_smp_config(unsigned int reserve)
+void __init default_find_smp_config(void)
 {
 	unsigned int address;
 
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
 	 * 2) Scan the top 1K of base RAM
 	 * 3) Scan the 64K of bios
 	 */
-	if (smp_scan_config(0x0, 0x400, reserve) ||
-	    smp_scan_config(639 * 0x400, 0x400, reserve) ||
-	    smp_scan_config(0xF0000, 0x10000, reserve))
+	if (smp_scan_config(0x0, 0x400) ||
+	    smp_scan_config(639 * 0x400, 0x400) ||
+	    smp_scan_config(0xF0000, 0x10000))
 		return;
 	/*
 	 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
 
 	address = get_bios_ebda();
 	if (address)
-		smp_scan_config(address, 0x400, reserve);
+		smp_scan_config(address, 0x400);
 }
 
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e3eae59..cdb6a8a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -913,6 +913,11 @@ void __init setup_arch(char **cmdline_p)
 
 	early_acpi_boot_init();
 
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
@@ -927,11 +932,6 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init(0, max_pfn, acpi, k8);
 
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-
 #ifdef CONFIG_X86_64
 	/*
 	 * dma32_reserve_bootmem() allocates bootmem which may conflict
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553..1498efa 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 	apic_version[m->apicid] = ver;
 }
 
-static void __init visws_find_smp_config(unsigned int reserve)
+static void __init visws_find_smp_config(void)
 {
 	struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
 	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
-- 
cgit v1.1


From 5bf65b9ba67226eae9ffc398a0369fc4da35c259 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 24 Nov 2009 02:46:59 -0800
Subject: x86, mtrr: Fix sorting of mtrr after subtracting

In some cases we can coalesce MTRR entries after cleanup; this may
allow us to have more entries.  As such, introduce clean_sort_range to
to sort and coaelsce the MTRR entries.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4B0BB9A3.5020908@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 49 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 6e49f6f..6987af7 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
 	return start1 - start2;
 }
 
+static int __init clean_sort_range(struct res_range *range, int az)
+{
+	int i, j, k = az - 1, nr_range = 0;
+
+	for (i = 0; i < k; i++) {
+		if (range[i].end)
+			continue;
+		for (j = k; j > i; j--) {
+			if (range[j].end) {
+				k = j;
+				break;
+			}
+		}
+		if (j == i)
+			break;
+		range[i].start = range[k].start;
+		range[i].end   = range[k].end;
+		range[k].start = 0;
+		range[k].end   = 0;
+		k--;
+	}
+	/* count it */
+	for (i = 0; i < az; i++) {
+		if (!range[i].end) {
+			nr_range = i;
+			break;
+		}
+	}
+
+	/* sort them */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+	return nr_range;
+}
+
 #define BIOS_BUG_MSG KERN_WARNING \
 	"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
 
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 		subtract_range(range, extra_remove_base,
 				 extra_remove_base + extra_remove_size  - 1);
 
-	/* get new range num */
-	nr_range = 0;
-	for (i = 0; i < RANGE_NUM; i++) {
-		if (!range[i].end)
-			continue;
-		nr_range++;
-	}
 	if  (debug_print) {
 		printk(KERN_DEBUG "After UC checking\n");
-		for (i = 0; i < nr_range; i++)
+		for (i = 0; i < RANGE_NUM; i++) {
+			if (!range[i].end)
+				continue;
 			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
 				 range[i].start, range[i].end + 1);
+		}
 	}
 
 	/* sort the ranges */
-	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	nr_range = clean_sort_range(range, RANGE_NUM);
 	if  (debug_print) {
 		printk(KERN_DEBUG "After sorting\n");
 		for (i = 0; i < nr_range; i++)
-- 
cgit v1.1


From 18ed61da985c57eea3fe8038b13fa2837c9b3c3f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 27 Nov 2009 15:24:44 +0100
Subject: x86: hpet: Make WARN_ON understandable

Andrew complained rightly that the WARN_ON in hpet_next_event() is
confusing and the code comment not really helpful.

Change it to WARN_ONCE and print the reason in clear text. Change the
comment to explain what kind of hardware wreckage we deal with.

Pointed-out-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com>
---
 arch/x86/kernel/hpet.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7f024ff..ba6e658 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -384,11 +384,22 @@ static int hpet_next_event(unsigned long delta,
 	hpet_writel(cnt, HPET_Tn_CMP(timer));
 
 	/*
-	 * We need to read back the CMP register to make sure that
-	 * what we wrote hit the chip before we compare it to the
-	 * counter.
+	 * We need to read back the CMP register on certain HPET
+	 * implementations (ATI chipsets) which seem to delay the
+	 * transfer of the compare register into the internal compare
+	 * logic. With small deltas this might actually be too late as
+	 * the counter could already be higher than the compare value
+	 * at that point and we would wait for the next hpet interrupt
+	 * forever. We found out that reading the CMP register back
+	 * forces the transfer so we can rely on the comparison with
+	 * the counter register below. If the read back from the
+	 * compare register does not match the value we programmed
+	 * then we might have a real hardware problem. We can not do
+	 * much about it here, but at least alert the user/admin with
+	 * a prominent warning.
 	 */
-	WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt);
+	WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
+		  KERN_WARNING "hpet: compare register read back failed.\n");
 
 	return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
 }
-- 
cgit v1.1


From ccef086454d4c97e7b722e9303390207d681cb4c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 30 Nov 2009 21:33:51 -0800
Subject: x86, mm: Correct the implementation of is_untracked_pat_range()

The semantics the PAT code expect of is_untracked_pat_range() is "is
this range completely contained inside the untracked region."  This
means that checkin 8a27138924f64d2f30c1022f909f74480046bc3f was
technically wrong, because the implementation needlessly confusing.

The sane interface is for it to take a semiclosed range like just
about everything else (as evidenced by the sheer number of "- 1"'s
removed by that patch) so change the actual implementation to match.

Reported-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jack Steiner <steiner@sgi.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <20091119202341.GA4420@sgi.com>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 597a47b..1e09417 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -39,7 +39,7 @@ static u64 gru_start_paddr, gru_end_paddr;
 
 static inline bool is_GRU_range(u64 start, u64 end)
 {
-	return start >= gru_start_paddr && end < gru_end_paddr;
+	return start >= gru_start_paddr && end <= gru_end_paddr;
 }
 
 static bool uv_is_untracked_pat_range(u64 start, u64 end)
-- 
cgit v1.1


From 57fea8f7ab67ef42b7f84999e49e47f8717a2d5b Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Thu, 3 Dec 2009 19:06:40 +0800
Subject: x86/reboot: Add pci_dev_put in reboot_fixup_32.c for consistency

pci_get_device will increase the ref count of found device.
Although we're going to reset soon, we should use pci_dev_put
to decrease the ref count for consistency.

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <1259838400-23833-1-git-send-email-dfeng@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot_fixups_32.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a8377..201eab6 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
 			continue;
 
 		cur->reboot_fixup(dev);
+		pci_dev_put(dev);
 	}
 }
 
-- 
cgit v1.1