From 6fd592daae2182adc47f405e20d07f34f52d07dd Mon Sep 17 00:00:00 2001 From: "Carlos R. Mafra" <crmafra2@gmail.com> Date: Mon, 5 May 2008 20:11:22 -0300 Subject: x86: clean up computation of HPET .mult variables While reading through the HPET code I realized that the computation of .mult variables could be done with less lines of code, resulting in a 1.6% text size saving for hpet.o So I propose the following patch, which applies against today's Linus -git tree. >From 0c6507e400e9ca5f7f14331e18f8c12baf75a9d3 Mon Sep 17 00:00:00 2001 From: Carlos R. Mafra <crmafra@ift.unesp.br> Date: Mon, 5 May 2008 19:38:53 -0300 The computation of clocksource_hpet.mult tmp = (u64)hpet_period << HPET_SHIFT; do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; can be streamlined if we note that it is equal to clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); Furthermore, the computation of hpet_clockevent.mult uint64_t hpet_freq; hpet_freq = 1000000000000000ULL; do_div(hpet_freq, hpet_period); hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, hpet_clockevent.shift); can also be streamlined with the observation that hpet_period and hpet_freq are inverse to each other (in proper units). So instead of computing hpet_freq and using (schematically) div_sc(hpet_freq, 10^9, shift) we use the trick of calling with the arguments in reverse order, div_sc(10^6, hpet_period, shift). The different power of ten is due to frequency being in Hertz (1/sec) and the period being in units of femtosecond. Explicitly, mult = (hpet_freq * 2^shift)/10^9 (before) mult = (10^6 * 2^shift)/hpet_period (after) because hpet_freq = 10^15/hpet_period. The comments in the code are also updated to reflect the changes. As a result, text data bss dec hex filename 2957 425 92 3474 d92 arch/x86/kernel/hpet.o 3006 425 92 3523 dc3 arch/x86/kernel/hpet.o.old a 1.6% reduction in text size. Signed-off-by: Carlos R. Mafra <crmafra@ift.unesp.br> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/hpet.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9b5cfcd..ea230ec 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -17,7 +17,7 @@ /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000 +#define FSEC_PER_NSEC 1000000L /* * HPET address is set in acpi/boot.c, when an ACPI entry exists @@ -206,20 +206,19 @@ static void hpet_enable_legacy_int(void) static void hpet_legacy_clockevent_register(void) { - uint64_t hpet_freq; - /* Start HPET legacy interrupts */ hpet_enable_legacy_int(); /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. + * The mult factor is defined as (include/linux/clockchips.h) + * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) + * hpet_period is in units of femtoseconds (per cycle), so + * mult/2^shift = cyc/ns = 10^6/hpet_period + * mult = (10^6 * 2^shift)/hpet_period + * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period */ - hpet_freq = 1000000000000000ULL; - do_div(hpet_freq, hpet_period); - hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, hpet_clockevent.shift); + hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, + hpet_period, hpet_clockevent.shift); /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); @@ -324,7 +323,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { - u64 tmp, start, now; + u64 start, now; cycle_t t1; /* Start the counter */ @@ -351,21 +350,15 @@ static int hpet_clocksource_register(void) return -ENODEV; } - /* Initialize and register HPET clocksource - * - * hpet period is in femto seconds per cycle - * so we need to convert this to ns/cyc units - * approximated by mult/2^shift - * - * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift - * fsec/cyc * 1ns/1000000fsec * 2^shift = mult - * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult - * (fsec/cyc << shift)/1000000 = mult - * (hpet_period << shift)/FSEC_PER_NSEC = mult + /* + * The definition of mult is (include/linux/clocksource.h) + * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc + * so we first need to convert hpet_period to ns/cyc units: + * mult/2^shift = ns/cyc = hpet_period/10^6 + * mult = (hpet_period * 2^shift)/10^6 + * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - tmp = (u64)hpet_period << HPET_SHIFT; - do_div(tmp, FSEC_PER_NSEC); - clocksource_hpet.mult = (u32)tmp; + clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); clocksource_register(&clocksource_hpet); -- cgit v1.1 From e8aa4667baf74dfd85fbaab86861465acb811085 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 9 May 2008 11:49:11 +0200 Subject: x86: enable hpet=force for AMD SB400 Add quirk to allow forced usage of HPET on ATI SB400. I stumbled over machines where HPET is enabled but not reported by BIOS. This patch configures the HPET base address and makes it known to the OS. Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/quirks.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d89a648..5fe6bd5 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -65,6 +65,7 @@ static enum { ICH_FORCE_HPET_RESUME, VT8237_FORCE_HPET_RESUME, NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, } force_hpet_resume_type; static void __iomem *rcba_base; @@ -330,6 +331,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, vt8237_force_enable_hpet); +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (!hpet_force_user || hpet_address || force_hpet_address) + return; + + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + /* * Undocumented chipset feature taken from LinuxBIOS. */ @@ -397,6 +423,9 @@ void force_hpet_resume(void) case NVIDIA_FORCE_HPET_RESUME: nvidia_force_hpet_resume(); return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; default: break; } -- cgit v1.1 From 7c4728f4a865067d96fb84f1d9c65e0ccd1f355d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 10 May 2008 21:42:14 +0200 Subject: x86: print info about available HPET quirk We have a lot of HPET quirks available which might force enable HPET even when the BIOS does not enable it. Some of those quirks depend on the command line option "hpet=force". Andrew pointed out that hoping that the user will find out about this boot option is not really helpful. Emit a kernel info which informs the user about the "hpet=force" boot option when we enter a quirk which depends on this option and the user did not provide it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/quirks.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 5fe6bd5..ddbf34d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -175,6 +175,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, static struct pci_dev *cached_dev; +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + static void old_ich_force_hpet_resume(void) { u32 val; @@ -254,6 +260,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev) { if (hpet_force_user) old_ich_force_enable_hpet(dev); + else + hpet_print_force_info(); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, @@ -291,8 +299,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_read_config_dword(dev, 0x68, &val); /* @@ -341,8 +354,13 @@ static void ati_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); @@ -369,8 +387,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) { u32 uninitialized_var(val); - if (!hpet_force_user || hpet_address || force_hpet_address) + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); return; + } pci_write_config_dword(dev, 0x44, 0xfed00001); pci_read_config_dword(dev, 0x44, &val); -- cgit v1.1 From 8edc5cc5ec880c96de8e6686fb0d7a5231e91c05 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Mon, 12 May 2008 15:43:34 +0200 Subject: x86: remove 6 bank limitation in 64 bit MCE reporting code Eliminate the 6 bank restriction in 64 bit mce reporting code. This restriction is artificial (due to static creation of sysfs files) and 32 bit code does not have any such restriction. This change helps in reporting the details of machine checks on a machine check exception with errors in bank 6 and above on CPUs that support those banks. Without the patch, machine check errors in those banks are not reported. We still have 128 (MCE_EXTENDED_BANK) bank restriction instead of max 256 supported in hardware. That is not changed in the patch below as it will have some user level mcelog utility dependency, with bank 128 being used for thermal reporting currently. The patch below does not create sysfs control (bankNctl) for banks higher than 6 as well. That needs some pre-cleanup in /sysfs mce layout, removal of per cpu /sysfs entries for bankctl as they are really global system level control today. That change will follow. This basic change is critical to report the detailed errors on banks higher than 6. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce_64.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c0..f1f3f5e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -31,7 +31,7 @@ #include <asm/idle.h> #define MISC_MCELOG_MINOR 227 -#define NR_BANKS 6 +#define NR_SYSFS_BANKS 6 atomic_t mce_entry; @@ -46,7 +46,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (i < NR_SYSFS_BANKS && !bank[i]) continue; m.misc = 0; @@ -444,9 +444,10 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; + if (banks > MCE_EXTENDED_BANK) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", + MCE_EXTENDED_BANK); + banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) @@ -462,7 +463,7 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -766,7 +767,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* TBD should generate these dynamically based on number of available banks */ +/* + * TBD should generate these dynamically based on number of available banks. + * Have only 6 contol banks in /sysfs until then. + */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) -- cgit v1.1 From 1c7d06d419dbe82c76fbb4d3e1fa61b2da2dc00b Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 30 Apr 2008 22:12:05 +0200 Subject: revert: thread_info.h change temporarily revert parts of "signals: x86 TS_RESTORE_SIGMASK". Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/thread_info_32.h | 13 ++----------- include/asm-x86/thread_info_64.h | 13 ++----------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h index b633882..5318599 100644 --- a/include/asm-x86/thread_info_32.h +++ b/include/asm-x86/thread_info_32.h @@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_EMU 5 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ #define TIF_SECCOMP 7 /* secure computing */ +#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ #define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ @@ -150,6 +151,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) @@ -186,20 +188,9 @@ static inline struct thread_info *current_thread_info(void) this quantum (SMP) */ #define TS_POLLING 0x0002 /* True if in idle loop and not sleeping */ -#define TS_RESTORE_SIGMASK 0x0004 /* restore signal mask in do_signal() */ #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) -#ifndef __ASSEMBLY__ -#define HAVE_SET_RESTORE_SIGMASK 1 -static inline void set_restore_sigmask(void) -{ - struct thread_info *ti = current_thread_info(); - ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); -} -#endif /* !__ASSEMBLY__ */ - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h index cb69f70..ed664e8 100644 --- a/include/asm-x86/thread_info_64.h +++ b/include/asm-x86/thread_info_64.h @@ -109,6 +109,7 @@ static inline struct thread_info *stack_thread_info(void) #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ /* 16 free */ @@ -132,6 +133,7 @@ static inline struct thread_info *stack_thread_info(void) #define _TIF_IRET (1 << TIF_IRET) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) #define _TIF_IA32 (1 << TIF_IA32) @@ -176,20 +178,9 @@ static inline struct thread_info *stack_thread_info(void) #define TS_COMPAT 0x0002 /* 32bit syscall active */ #define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */ -#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) -#ifndef __ASSEMBLY__ -#define HAVE_SET_RESTORE_SIGMASK 1 -static inline void set_restore_sigmask(void) -{ - struct thread_info *ti = current_thread_info(); - ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); -} -#endif /* !__ASSEMBLY__ */ - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ -- cgit v1.1 From 2052e8d40ad58b1d364f900e70edfda62caa0874 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 12 May 2008 15:43:41 +0200 Subject: x86: merge thread_info.h Simple merge of both thread_info_32.h and thread_info_64.h into thread_info.h. Comments for #ifndef __ASM_THREAD_INFO_H and #ifdef __KERNEL__ are the same. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/thread_info.h | 366 ++++++++++++++++++++++++++++++++++++++- include/asm-x86/thread_info_32.h | 196 --------------------- include/asm-x86/thread_info_64.h | 186 -------------------- 3 files changed, 364 insertions(+), 384 deletions(-) delete mode 100644 include/asm-x86/thread_info_32.h delete mode 100644 include/asm-x86/thread_info_64.h diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 77244f1..d59384a 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -1,10 +1,372 @@ +/* thread_info.h: low-level thread information + * + * Copyright (C) 2002 David Howells (dhowells@redhat.com) + * - Incorporating suggestions made by Linus Torvalds and Dave Miller + */ + #ifndef _ASM_X86_THREAD_INFO_H +#define _ASM_X86_THREAD_INFO_H + #ifdef CONFIG_X86_32 -# include "thread_info_32.h" +#include <linux/compiler.h> +#include <asm/page.h> + +#ifndef __ASSEMBLY__ +#include <asm/processor.h> +#endif + +/* + * low level task data that entry.S needs immediate access to + * - this struct should fit entirely inside of one cache line + * - this struct shares the supervisor stack pages + * - if the contents of this structure are changed, + * the assembly constants must also be changed + */ +#ifndef __ASSEMBLY__ + +struct thread_info { + struct task_struct *task; /* main task structure */ + struct exec_domain *exec_domain; /* execution domain */ + unsigned long flags; /* low level flags */ + unsigned long status; /* thread-synchronous flags */ + __u32 cpu; /* current CPU */ + int preempt_count; /* 0 => preemptable, + <0 => BUG */ + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF user-thread + 0-0xFFFFFFFF kernel-thread + */ + void *sysenter_return; + struct restart_block restart_block; + unsigned long previous_esp; /* ESP of the previous stack in + case of nested (IRQ) stacks + */ + __u8 supervisor_stack[0]; +}; + +#else /* !__ASSEMBLY__ */ + +#include <asm/asm-offsets.h> + +#endif + +#define PREEMPT_ACTIVE 0x10000000 +#ifdef CONFIG_4KSTACKS +#define THREAD_SIZE (4096) #else -# include "thread_info_64.h" +#define THREAD_SIZE (8192) #endif +#define STACK_WARN (THREAD_SIZE/8) +/* + * macros/functions for gaining access to the thread information structure + * + * preempt_count needs to be 1 initially, until the scheduler is functional. + */ +#ifndef __ASSEMBLY__ + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ +} + +#define init_thread_info (init_thread_union.thread_info) +#define init_stack (init_thread_union.stack) + + +/* how to get the current stack pointer from C */ +register unsigned long current_stack_pointer asm("esp") __used; + +/* how to get the thread information struct from C */ +static inline struct thread_info *current_thread_info(void) +{ + return (struct thread_info *) + (current_stack_pointer & ~(THREAD_SIZE - 1)); +} + +/* thread information allocation */ +#ifdef CONFIG_DEBUG_STACK_USAGE +#define alloc_thread_info(tsk) ((struct thread_info *) \ + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(THREAD_SIZE))) +#else +#define alloc_thread_info(tsk) ((struct thread_info *) \ + __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) +#endif + +#else /* !__ASSEMBLY__ */ + +/* how to get the thread information struct from ASM */ +#define GET_THREAD_INFO(reg) \ + movl $-THREAD_SIZE, reg; \ + andl %esp, reg + +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg + +#endif + +/* + * thread information flags + * - these are process state flags that various + * assembly files may need to access + * - pending work-to-be-done flags are in LSW + * - other flags in MSW + */ +#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_SIGPENDING 1 /* signal pending */ +#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_SINGLESTEP 3 /* restore singlestep on return to + user mode */ +#define TIF_IRET 4 /* return with iret */ +#define TIF_SYSCALL_EMU 5 /* syscall emulation active */ +#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ +#define TIF_SECCOMP 7 /* secure computing */ +#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ +#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ +#define TIF_MEMDIE 16 +#define TIF_DEBUG 17 /* uses debug registers */ +#define TIF_IO_BITMAP 18 /* uses I/O bitmap */ +#define TIF_FREEZE 19 /* is freezing for suspend */ +#define TIF_NOTSC 20 /* TSC is not accessible in userland */ +#define TIF_FORCED_TF 21 /* true if TF in eflags artificially */ +#define TIF_DEBUGCTLMSR 22 /* uses thread_struct.debugctlmsr */ +#define TIF_DS_AREA_MSR 23 /* uses thread_struct.ds_area_msr */ +#define TIF_BTS_TRACE_TS 24 /* record scheduling event timestamps */ + +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_IRET (1 << TIF_IRET) +#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) +#define _TIF_DEBUG (1 << TIF_DEBUG) +#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_NOTSC (1 << TIF_NOTSC) +#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) +#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) + +/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK \ + (0x0000FFFF & ~(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SYSCALL_EMU)) +/* work to do on any return to u-space */ +#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) + +/* flags to check in __switch_to() */ +#define _TIF_WORK_CTXSW \ + (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \ + _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS) +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG) + + +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_USEDFPU 0x0001 /* FPU was used by this task + this quantum (SMP) */ +#define TS_POLLING 0x0002 /* True if in idle loop + and not sleeping */ + +#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) + +#else /* X86_32 */ + +#include <asm/page.h> +#include <asm/types.h> +#include <asm/pda.h> + +/* + * low level task data that entry.S needs immediate access to + * - this struct should fit entirely inside of one cache line + * - this struct shares the supervisor stack pages + */ +#ifndef __ASSEMBLY__ +struct task_struct; +struct exec_domain; +#include <asm/processor.h> + +struct thread_info { + struct task_struct *task; /* main task structure */ + struct exec_domain *exec_domain; /* execution domain */ + __u32 flags; /* low level flags */ + __u32 status; /* thread synchronous flags */ + __u32 cpu; /* current CPU */ + int preempt_count; /* 0 => preemptable, + <0 => BUG */ + mm_segment_t addr_limit; + struct restart_block restart_block; +#ifdef CONFIG_IA32_EMULATION + void __user *sysenter_return; +#endif +}; +#endif + +/* + * macros/functions for gaining access to the thread information structure + * preempt_count needs to be 1 initially, until the scheduler is functional. + */ +#ifndef __ASSEMBLY__ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ +} + +#define init_thread_info (init_thread_union.thread_info) +#define init_stack (init_thread_union.stack) + +static inline struct thread_info *current_thread_info(void) +{ + struct thread_info *ti; + ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE); + return ti; +} + +/* do not use in interrupt context */ +static inline struct thread_info *stack_thread_info(void) +{ + struct thread_info *ti; + asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); + return ti; +} + +/* thread information allocation */ +#ifdef CONFIG_DEBUG_STACK_USAGE +#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) +#else +#define THREAD_FLAGS GFP_KERNEL +#endif + +#define alloc_thread_info(tsk) \ + ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) + +#else /* !__ASSEMBLY__ */ + +/* how to get the thread information struct from ASM */ +#define GET_THREAD_INFO(reg) \ + movq %gs:pda_kernelstack,reg ; \ + subq $(THREAD_SIZE-PDA_STACKOFFSET),reg + +#endif + +/* + * thread information flags + * - these are process state flags that various assembly files + * may need to access + * - pending work-to-be-done flags are in LSW + * - other flags in MSW + * Warning: layout of LSW is hardcoded in entry.S + */ +#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_SIGPENDING 2 /* signal pending */ +#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_IRET 5 /* force IRET */ +#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ +#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ +/* 16 free */ +#define TIF_IA32 17 /* 32bit process */ +#define TIF_FORK 18 /* ret_from_fork */ +#define TIF_ABI_PENDING 19 +#define TIF_MEMDIE 20 +#define TIF_DEBUG 21 /* uses debug registers */ +#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ +#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ +#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ +#define TIF_NOTSC 28 /* TSC is not accessible in userland */ + +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_IRET (1 << TIF_IRET) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) +#define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_FORK (1 << TIF_FORK) +#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) +#define _TIF_DEBUG (1 << TIF_DEBUG) +#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) +#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +#define _TIF_NOTSC (1 << TIF_NOTSC) + +/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK \ + (0x0000FFFF & \ + ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP)) +/* work to do on any return to user space */ +#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) + +#define _TIF_DO_NOTIFY_MASK \ + (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) + +/* flags to check in __switch_to() */ +#define _TIF_WORK_CTXSW \ + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|_TIF_NOTSC) +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) + +#define PREEMPT_ACTIVE 0x10000000 + +/* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_USEDFPU 0x0001 /* FPU was used by this task + this quantum (SMP) */ +#define TS_COMPAT 0x0002 /* 32bit syscall active */ +#define TS_POLLING 0x0004 /* true if in idle loop + and not sleeping */ + +#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) + +#endif /* !X86_32 */ + + #ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h deleted file mode 100644 index 5318599..0000000 --- a/include/asm-x86/thread_info_32.h +++ /dev/null @@ -1,196 +0,0 @@ -/* thread_info.h: i386 low-level thread information - * - * Copyright (C) 2002 David Howells (dhowells@redhat.com) - * - Incorporating suggestions made by Linus Torvalds and Dave Miller - */ - -#ifndef _ASM_THREAD_INFO_H -#define _ASM_THREAD_INFO_H - -#ifdef __KERNEL__ - -#include <linux/compiler.h> -#include <asm/page.h> - -#ifndef __ASSEMBLY__ -#include <asm/processor.h> -#endif - -/* - * low level task data that entry.S needs immediate access to - * - this struct should fit entirely inside of one cache line - * - this struct shares the supervisor stack pages - * - if the contents of this structure are changed, - * the assembly constants must also be changed - */ -#ifndef __ASSEMBLY__ - -struct thread_info { - struct task_struct *task; /* main task structure */ - struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ - unsigned long status; /* thread-synchronous flags */ - __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, - <0 => BUG */ - mm_segment_t addr_limit; /* thread address space: - 0-0xBFFFFFFF user-thread - 0-0xFFFFFFFF kernel-thread - */ - void *sysenter_return; - struct restart_block restart_block; - unsigned long previous_esp; /* ESP of the previous stack in - case of nested (IRQ) stacks - */ - __u8 supervisor_stack[0]; -}; - -#else /* !__ASSEMBLY__ */ - -#include <asm/asm-offsets.h> - -#endif - -#define PREEMPT_ACTIVE 0x10000000 -#ifdef CONFIG_4KSTACKS -#define THREAD_SIZE (4096) -#else -#define THREAD_SIZE (8192) -#endif - -#define STACK_WARN (THREAD_SIZE/8) -/* - * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. - */ -#ifndef __ASSEMBLY__ - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ -} - -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - - -/* how to get the current stack pointer from C */ -register unsigned long current_stack_pointer asm("esp") __used; - -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); -} - -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(THREAD_SIZE))) -#else -#define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) -#endif - -#else /* !__ASSEMBLY__ */ - -/* how to get the thread information struct from ASM */ -#define GET_THREAD_INFO(reg) \ - movl $-THREAD_SIZE, reg; \ - andl %esp, reg - -/* use this one if reg already contains %esp */ -#define GET_THREAD_INFO_WITH_ESP(reg) \ - andl $-THREAD_SIZE, reg - -#endif - -/* - * thread information flags - * - these are process state flags that various - * assembly files may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_SINGLESTEP 3 /* restore singlestep on return to - user mode */ -#define TIF_IRET 4 /* return with iret */ -#define TIF_SYSCALL_EMU 5 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ -#define TIF_SECCOMP 7 /* secure computing */ -#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ -#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ -#define TIF_MEMDIE 16 -#define TIF_DEBUG 17 /* uses debug registers */ -#define TIF_IO_BITMAP 18 /* uses I/O bitmap */ -#define TIF_FREEZE 19 /* is freezing for suspend */ -#define TIF_NOTSC 20 /* TSC is not accessible in userland */ -#define TIF_FORCED_TF 21 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 22 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 23 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 24 /* record scheduling event timestamps */ - -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_IRET (1 << TIF_IRET) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) -#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) -#define _TIF_DEBUG (1 << TIF_DEBUG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & ~(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SYSCALL_EMU)) -/* work to do on any return to u-space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) - -/* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \ - _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG) - - -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ -#define TS_POLLING 0x0002 /* True if in idle loop - and not sleeping */ - -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - -#endif /* __KERNEL__ */ - -#endif /* _ASM_THREAD_INFO_H */ diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h deleted file mode 100644 index ed664e8..0000000 --- a/include/asm-x86/thread_info_64.h +++ /dev/null @@ -1,186 +0,0 @@ -/* thread_info.h: x86_64 low-level thread information - * - * Copyright (C) 2002 David Howells (dhowells@redhat.com) - * - Incorporating suggestions made by Linus Torvalds and Dave Miller - */ - -#ifndef _ASM_THREAD_INFO_H -#define _ASM_THREAD_INFO_H - -#ifdef __KERNEL__ - -#include <asm/page.h> -#include <asm/types.h> -#include <asm/pda.h> - -/* - * low level task data that entry.S needs immediate access to - * - this struct should fit entirely inside of one cache line - * - this struct shares the supervisor stack pages - */ -#ifndef __ASSEMBLY__ -struct task_struct; -struct exec_domain; -#include <asm/processor.h> - -struct thread_info { - struct task_struct *task; /* main task structure */ - struct exec_domain *exec_domain; /* execution domain */ - __u32 flags; /* low level flags */ - __u32 status; /* thread synchronous flags */ - __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, - <0 => BUG */ - mm_segment_t addr_limit; - struct restart_block restart_block; -#ifdef CONFIG_IA32_EMULATION - void __user *sysenter_return; -#endif -}; -#endif - -/* - * macros/functions for gaining access to the thread information structure - * preempt_count needs to be 1 initially, until the scheduler is functional. - */ -#ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ -} - -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - -static inline struct thread_info *current_thread_info(void) -{ - struct thread_info *ti; - ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE); - return ti; -} - -/* do not use in interrupt context */ -static inline struct thread_info *stack_thread_info(void) -{ - struct thread_info *ti; - asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); - return ti; -} - -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) -#else -#define THREAD_FLAGS GFP_KERNEL -#endif - -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) - -#else /* !__ASSEMBLY__ */ - -/* how to get the thread information struct from ASM */ -#define GET_THREAD_INFO(reg) \ - movq %gs:pda_kernelstack,reg ; \ - subq $(THREAD_SIZE-PDA_STACKOFFSET),reg - -#endif - -/* - * thread information flags - * - these are process state flags that various assembly files - * may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - * Warning: layout of LSW is hardcoded in entry.S - */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_IRET 5 /* force IRET */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ -#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ -#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ -/* 16 free */ -#define TIF_IA32 17 /* 32bit process */ -#define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 -#define TIF_MEMDIE 20 -#define TIF_DEBUG 21 /* uses debug registers */ -#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -#define TIF_FREEZE 23 /* is freezing for suspend */ -#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ -#define TIF_NOTSC 28 /* TSC is not accessible in userland */ - -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_IRET (1 << TIF_IRET) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) -#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) -#define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) -#define _TIF_DEBUG (1 << TIF_DEBUG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) -#define _TIF_NOTSC (1 << TIF_NOTSC) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP)) -/* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) - -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) - -/* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|_TIF_NOTSC) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) - -#define PREEMPT_ACTIVE 0x10000000 - -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ -#define TS_COMPAT 0x0002 /* 32bit syscall active */ -#define TS_POLLING 0x0004 /* true if in idle loop - and not sleeping */ - -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - -#endif /* __KERNEL__ */ - -#endif /* _ASM_THREAD_INFO_H */ -- cgit v1.1 From 12a638e13c68bbe187782518dab375f4fa800fc4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:33 -0700 Subject: x86: threadinfo: common include files Move shared includes to a common area in thread_info.h Adds asm/types.h for x86_64 and linux/compiler.h for x86_32. Not needed but we can avoid some ifdeffing and it simplifies later joining. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index d59384a..d2dc1a3 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -7,9 +7,11 @@ #ifndef _ASM_X86_THREAD_INFO_H #define _ASM_X86_THREAD_INFO_H -#ifdef CONFIG_X86_32 #include <linux/compiler.h> #include <asm/page.h> +#include <asm/types.h> + +#ifdef CONFIG_X86_32 #ifndef __ASSEMBLY__ #include <asm/processor.h> @@ -192,8 +194,6 @@ static inline struct thread_info *current_thread_info(void) #else /* X86_32 */ -#include <asm/page.h> -#include <asm/types.h> #include <asm/pda.h> /* -- cgit v1.1 From f2ea3b1d4d7ab66d86da57899993282f3deb1f74 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:34 -0700 Subject: x86: threadinfo: merge thread sync state definitions Merge both. x86_64 has an additional TS_COMPAT that is harmless for 32 bit. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index d2dc1a3..4b91f59 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -177,21 +177,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG) - -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ -#define TS_POLLING 0x0002 /* True if in idle loop - and not sleeping */ - -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #else /* X86_32 */ #include <asm/pda.h> @@ -349,6 +334,8 @@ static inline struct thread_info *stack_thread_info(void) #define PREEMPT_ACTIVE 0x10000000 +#endif /* !X86_32 */ + /* * Thread-synchronous status. * @@ -358,15 +345,12 @@ static inline struct thread_info *stack_thread_info(void) */ #define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ -#define TS_COMPAT 0x0002 /* 32bit syscall active */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */ #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) -#endif /* !X86_32 */ - - #ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); -- cgit v1.1 From 006c484bb3d9547e82a33a09668c9b54b912c8fb Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:35 -0700 Subject: x86: common thread_info definitions Merge the thread_info definition into one structure definition for both arches. The __u32 is equal to unsigned long for 32 bit. sysenter_return is used both for the IA32 emulation for 64 and x86_32. Avoid complicated #ifdef by simply always including it. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 55 ++++++++++--------------------------------- 1 file changed, 12 insertions(+), 43 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 4b91f59..71b0880 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -11,47 +11,42 @@ #include <asm/page.h> #include <asm/types.h> -#ifdef CONFIG_X86_32 - -#ifndef __ASSEMBLY__ -#include <asm/processor.h> -#endif - /* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages - * - if the contents of this structure are changed, - * the assembly constants must also be changed */ #ifndef __ASSEMBLY__ +struct task_struct; +struct exec_domain; +#include <asm/processor.h> struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ - unsigned long status; /* thread-synchronous flags */ + __u32 flags; /* low level flags */ + __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, + int preempt_count; /* 0 => preemptable, <0 => BUG */ - mm_segment_t addr_limit; /* thread address space: - 0-0xBFFFFFFF user-thread - 0-0xFFFFFFFF kernel-thread - */ - void *sysenter_return; + mm_segment_t addr_limit; struct restart_block restart_block; + void __user *sysenter_return; +#ifdef CONFIG_X86_32 unsigned long previous_esp; /* ESP of the previous stack in case of nested (IRQ) stacks */ __u8 supervisor_stack[0]; +#endif }; - #else /* !__ASSEMBLY__ */ #include <asm/asm-offsets.h> #endif +#ifdef CONFIG_X86_32 + #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_4KSTACKS #define THREAD_SIZE (4096) @@ -182,32 +177,6 @@ static inline struct thread_info *current_thread_info(void) #include <asm/pda.h> /* - * low level task data that entry.S needs immediate access to - * - this struct should fit entirely inside of one cache line - * - this struct shares the supervisor stack pages - */ -#ifndef __ASSEMBLY__ -struct task_struct; -struct exec_domain; -#include <asm/processor.h> - -struct thread_info { - struct task_struct *task; /* main task structure */ - struct exec_domain *exec_domain; /* execution domain */ - __u32 flags; /* low level flags */ - __u32 status; /* thread synchronous flags */ - __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, - <0 => BUG */ - mm_segment_t addr_limit; - struct restart_block restart_block; -#ifdef CONFIG_IA32_EMULATION - void __user *sysenter_return; -#endif -}; -#endif - -/* * macros/functions for gaining access to the thread information structure * preempt_count needs to be 1 initially, until the scheduler is functional. */ -- cgit v1.1 From 3351cc03c0762353225a79507e38db4c1e656d52 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:36 -0700 Subject: x86: threadinfo: merge INIT_THREAD_INFO Both definitions are the same. So move to common x86 area. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 49 +++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 71b0880..8cd52d4 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -39,6 +39,23 @@ struct thread_info { __u8 supervisor_stack[0]; #endif }; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ +} + +#define init_thread_info (init_thread_union.thread_info) +#define init_stack (init_thread_union.stack) + #else /* !__ASSEMBLY__ */ #include <asm/asm-offsets.h> @@ -62,22 +79,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ -} - -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the current stack pointer from C */ register unsigned long current_stack_pointer asm("esp") __used; @@ -181,22 +182,6 @@ static inline struct thread_info *current_thread_info(void) * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ -} - -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; -- cgit v1.1 From 24e2de6e28a453cd114b06215df2f9931cd0c342 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:37 -0700 Subject: x86: thread_info: PREEMPT_ACTIVE Same for both 32 and 64 bit so simply put it in to the common area. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 8cd52d4..f8d5cf5 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -62,9 +62,10 @@ struct thread_info { #endif +#define PREEMPT_ACTIVE 0x10000000 + #ifdef CONFIG_X86_32 -#define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_4KSTACKS #define THREAD_SIZE (4096) #else @@ -286,8 +287,6 @@ static inline struct thread_info *stack_thread_info(void) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) -#define PREEMPT_ACTIVE 0x10000000 - #endif /* !X86_32 */ /* -- cgit v1.1 From e57549b017552f7a493b366f5ccd4781801083e4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:38 -0700 Subject: x86: thread_info: merge TIF_ flags. Both TIF lists are essentially the same. x86_32 also has TIF_SYSCALL_EMU which must be undefined for the 64 bit case. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 162 ++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 101 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index f8d5cf5..983743a 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -62,6 +62,67 @@ struct thread_info { #endif +/* + * thread information flags + * - these are process state flags that various assembly files + * may need to access + * - pending work-to-be-done flags are in LSW + * - other flags in MSW + * Warning: layout of LSW is hardcoded in entry.S + */ +#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_SIGPENDING 2 /* signal pending */ +#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_IRET 5 /* force IRET */ +#ifdef CONFIG_X86_32 +#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ +#endif +#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ +#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ +#define TIF_NOTSC 16 /* TSC is not accessible in userland */ +#define TIF_IA32 17 /* 32bit process */ +#define TIF_FORK 18 /* ret_from_fork */ +#define TIF_ABI_PENDING 19 +#define TIF_MEMDIE 20 +#define TIF_DEBUG 21 /* uses debug registers */ +#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ +#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ +#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ + +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_IRET (1 << TIF_IRET) +#ifdef CONFIG_X86_32 +#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) +#else +#define _TIF_SYSCALL_EMU 0 +#endif +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) +#define _TIF_NOTSC (1 << TIF_NOTSC) +#define _TIF_IA32 (1 << TIF_IA32) +#define _TIF_FORK (1 << TIF_FORK) +#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) +#define _TIF_DEBUG (1 << TIF_DEBUG) +#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) +#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) + #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_X86_32 @@ -113,53 +174,6 @@ static inline struct thread_info *current_thread_info(void) #endif -/* - * thread information flags - * - these are process state flags that various - * assembly files may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_SINGLESTEP 3 /* restore singlestep on return to - user mode */ -#define TIF_IRET 4 /* return with iret */ -#define TIF_SYSCALL_EMU 5 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ -#define TIF_SECCOMP 7 /* secure computing */ -#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ -#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ -#define TIF_MEMDIE 16 -#define TIF_DEBUG 17 /* uses debug registers */ -#define TIF_IO_BITMAP 18 /* uses I/O bitmap */ -#define TIF_FREEZE 19 /* is freezing for suspend */ -#define TIF_NOTSC 20 /* TSC is not accessible in userland */ -#define TIF_FORCED_TF 21 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 22 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 23 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 24 /* record scheduling event timestamps */ - -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_IRET (1 << TIF_IRET) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) -#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) -#define _TIF_DEBUG (1 << TIF_DEBUG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) - /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & ~(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ @@ -217,60 +231,6 @@ static inline struct thread_info *stack_thread_info(void) #endif -/* - * thread information flags - * - these are process state flags that various assembly files - * may need to access - * - pending work-to-be-done flags are in LSW - * - other flags in MSW - * Warning: layout of LSW is hardcoded in entry.S - */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_IRET 5 /* force IRET */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ -#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ -#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ -/* 16 free */ -#define TIF_IA32 17 /* 32bit process */ -#define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 -#define TIF_MEMDIE 20 -#define TIF_DEBUG 21 /* uses debug registers */ -#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -#define TIF_FREEZE 23 /* is freezing for suspend */ -#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ -#define TIF_NOTSC 28 /* TSC is not accessible in userland */ - -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_IRET (1 << TIF_IRET) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) -#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) -#define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) -#define _TIF_DEBUG (1 << TIF_DEBUG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) -#define _TIF_NOTSC (1 << TIF_NOTSC) - /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ (0x0000FFFF & \ -- cgit v1.1 From 00c1bb133cf351fa3904b00a48a9cf535d018de6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:39 -0700 Subject: x86: thread_info: merge tif masks The TIF masks are basically the same. x86_32 also has _TIF_SYSCALL_EMU which is zero for the 64 bit case. The tif masks become the same. x86_64 has an additional _TIF_DONOTIFY_MASK. Does not hurt for the 32 bit case since it is only used in x86_64 arch code. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 51 ++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 983743a..b7cd413 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -123,6 +123,27 @@ struct thread_info { #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +/* work to do on interrupt/exception return */ +#define _TIF_WORK_MASK \ + (0x0000FFFF & \ + ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP| \ + _TIF_SECCOMP|_TIF_SYSCALL_EMU)) + +/* work to do on any return to user space */ +#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) + +#define _TIF_DO_NOTIFY_MASK \ + (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) + +/* flags to check in __switch_to() */ +#define _TIF_WORK_CTXSW \ + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ + _TIF_NOTSC) + +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) + + #define PREEMPT_ACTIVE 0x10000000 #ifdef CONFIG_X86_32 @@ -174,20 +195,6 @@ static inline struct thread_info *current_thread_info(void) #endif -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & ~(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SYSCALL_EMU)) -/* work to do on any return to u-space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) - -/* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP | _TIF_NOTSC | _TIF_DEBUGCTLMSR | \ - _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW | _TIF_DEBUG) - #else /* X86_32 */ #include <asm/pda.h> @@ -231,22 +238,6 @@ static inline struct thread_info *stack_thread_info(void) #endif -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP)) -/* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) - -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) - -/* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS|_TIF_NOTSC) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) - #endif /* !X86_32 */ /* -- cgit v1.1 From b84200b3a0fafa167185201319940d8df62a8c7b Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Mon, 28 Apr 2008 18:52:40 -0700 Subject: x86: thread_info: merge thread_info allocation Make them similar so that both use THREAD_ORDER and THREAD_FLAGS and have a THREAD_SIZE definition that is setup in asm/page_xx.h Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/page_32.h | 8 ++++++++ include/asm-x86/thread_info.h | 37 +++++++++++-------------------------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 424e82f..50b33eb 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -13,6 +13,14 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#ifdef CONFIG_4KSTACKS +#define THREAD_ORDER 0 +#else +#define THREAD_ORDER 1 +#endif +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) + + #ifdef CONFIG_X86_PAE #define __PHYSICAL_MASK_SHIFT 36 #define __VIRTUAL_MASK_SHIFT 32 diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index b7cd413..348f0e0 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -132,6 +132,7 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) +/* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED) @@ -143,18 +144,21 @@ struct thread_info { #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) - #define PREEMPT_ACTIVE 0x10000000 -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_4KSTACKS -#define THREAD_SIZE (4096) +/* thread information allocation */ +#ifdef CONFIG_DEBUG_STACK_USAGE +#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) #else -#define THREAD_SIZE (8192) +#define THREAD_FLAGS GFP_KERNEL #endif -#define STACK_WARN (THREAD_SIZE/8) +#define alloc_thread_info(tsk) \ + ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) + +#ifdef CONFIG_X86_32 + +#define STACK_WARN (THREAD_SIZE/8) /* * macros/functions for gaining access to the thread information structure * @@ -173,15 +177,6 @@ static inline struct thread_info *current_thread_info(void) (current_stack_pointer & ~(THREAD_SIZE - 1)); } -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(THREAD_SIZE))) -#else -#define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) -#endif - #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ @@ -219,16 +214,6 @@ static inline struct thread_info *stack_thread_info(void) return ti; } -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) -#else -#define THREAD_FLAGS GFP_KERNEL -#endif - -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) - #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ -- cgit v1.1 From 8a6c160a2a13d82c75a50af7282b906cce948df5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 30 Apr 2008 22:13:44 +0200 Subject: x86: redo thread_info.h change redo Roland's "signals: x86 TS_RESTORE_SIGMASK" ontop of the unified thread_info.h file. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/thread_info.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 348f0e0..74481b7 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -80,7 +80,6 @@ struct thread_info { #endif #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -108,7 +107,6 @@ struct thread_info { #endif #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_HRTICK_RESCHED (1 << TIF_HRTICK_RESCHED) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -237,10 +235,21 @@ static inline struct thread_info *stack_thread_info(void) #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */ +#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) #ifndef __ASSEMBLY__ +#define HAVE_SET_RESTORE_SIGMASK 1 +static inline void set_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + ti->status |= TS_RESTORE_SIGMASK; + set_bit(TIF_SIGPENDING, &ti->flags); +} +#endif /* !__ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -- cgit v1.1 From 41b3eae669fb1ef6ae4acaa937b4e4617a1aa078 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Tue, 22 Apr 2008 16:29:26 +0100 Subject: x86: minor polishing to top-level arch Makefile Use build target when creating compatibility link, and use $(boot) where possible. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3cff3c8..5df0d1e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -210,12 +210,12 @@ all: bzImage # KBUILD_IMAGE specify target image being built KBUILD_IMAGE := $(boot)/bzImage -zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage +zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage zImage bzImage: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot - $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage + $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ compressed: zImage -- cgit v1.1 From 6859a8402945cf1d74af75a2e1aa4e327a506ab4 Mon Sep 17 00:00:00 2001 From: Alan Mayer <ajm@sgi.com> Date: Wed, 26 Mar 2008 16:11:31 -0500 Subject: x86: resize NR_IRQS for large machines On machines with very large numbers of cpus, tables that are dimensioned by NR_IRQS get very large, especially the irq_desc table. They are also very sparsely used. When the cpu count is > MAX_IO_APICS, use MAX_IO_APICS to set NR_IRQS, otherwise use NR_CPUS. Signed-off-by: Alan Mayer <ajm@sgi.com> Reviewed-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/irq_64.h | 6 ++++++ include/linux/kernel_stat.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h index 083d35a..7608176 100644 --- a/include/asm-x86/irq_64.h +++ b/include/asm-x86/irq_64.h @@ -10,6 +10,8 @@ * <tomsoft@informatik.tu-chemnitz.de> */ +#include <asm/apicdef.h> + #define TIMER_IRQ 0 /* @@ -31,7 +33,11 @@ #define FIRST_SYSTEM_VECTOR 0xef /* duplicated in hw_irq.h */ +#if NR_CPUS < MAX_IO_APICS #define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) +#else +#define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +#endif #define NR_IRQ_VECTORS NR_IRQS static inline int irq_canonicalize(int irq) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index e8ffce8..cf9f40a 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -1,11 +1,11 @@ #ifndef _LINUX_KERNEL_STAT_H #define _LINUX_KERNEL_STAT_H -#include <asm/irq.h> #include <linux/smp.h> #include <linux/threads.h> #include <linux/percpu.h> #include <linux/cpumask.h> +#include <asm/irq.h> #include <asm/cputime.h> /* -- cgit v1.1 From 2e0884362d1fe36ef2d673d763d6ce35e2044e66 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 19:00:30 +0200 Subject: x86: move common declarations to hw_irq.h Move the common declarations from hw_irq_32/64 into hw_irq.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/hw_irq.h | 81 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/hw_irq_32.h | 61 ---------------------------------- include/asm-x86/hw_irq_64.h | 68 ------------------------------------- 3 files changed, 81 insertions(+), 129 deletions(-) diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index bf02539..38af08e 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -1,5 +1,86 @@ +#ifndef _ASM_HW_IRQ_H +#define _ASM_HW_IRQ_H + +/* + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * moved some of the old arch/i386/kernel/irq.h to here. VY + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@informatik.tu-chemnitz.de> + * + * hacked by Andi Kleen for x86-64. + * unified by tglx + */ + +#define NMI_VECTOR 0x02 + +#ifndef __ASSEMBLY__ + +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/smp.h> + +#include <asm/atomic.h> +#include <asm/irq.h> +#include <asm/sections.h> + +#define platform_legacy_irq(irq) ((irq) < 16) + +/* Interrupt handlers registered during init_IRQ */ +extern void apic_timer_interrupt(void); +extern void error_interrupt(void); +extern void spurious_interrupt(void); +extern void thermal_interrupt(void); +extern void reschedule_interrupt(void); + +extern void invalidate_interrupt(void); +extern void invalidate_interrupt0(void); +extern void invalidate_interrupt1(void); +extern void invalidate_interrupt2(void); +extern void invalidate_interrupt3(void); +extern void invalidate_interrupt4(void); +extern void invalidate_interrupt5(void); +extern void invalidate_interrupt6(void); +extern void invalidate_interrupt7(void); + +extern void irq_move_cleanup_interrupt(void); +extern void threshold_interrupt(void); + +extern void call_function_interrupt(void); + +/* PIC specific functions */ +extern void disable_8259A_irq(unsigned int irq); +extern void enable_8259A_irq(unsigned int irq); +extern int i8259A_irq_pending(unsigned int irq); +extern void make_8259A_irq(unsigned int irq); +extern void init_8259A(int aeoi); + +/* IOAPIC */ +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) +extern unsigned long io_apic_irqs; + +extern void init_VISWS_APIC_irqs(void); +extern void setup_IO_APIC(void); +extern void disable_IO_APIC(void); +extern void print_IO_APIC(void); +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +extern void setup_ioapic_dest(void); + +/* IPI functions */ +extern void send_IPI_self(int vector); +extern void send_IPI(int dest, int vector); + +/* Statistics */ +extern atomic_t irq_err_count; +extern atomic_t irq_mis_count; + +#endif /* !ASSEMBLY_ */ + #ifdef CONFIG_X86_32 # include "hw_irq_32.h" #else # include "hw_irq_64.h" #endif + +#endif diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h index ea88054..89fca5a 100644 --- a/include/asm-x86/hw_irq_32.h +++ b/include/asm-x86/hw_irq_32.h @@ -1,66 +1,5 @@ -#ifndef _ASM_HW_IRQ_H -#define _ASM_HW_IRQ_H - -/* - * linux/include/asm/hw_irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * moved some of the old arch/i386/kernel/irq.h to here. VY - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@informatik.tu-chemnitz.de> - */ - -#include <linux/profile.h> -#include <asm/atomic.h> -#include <asm/irq.h> -#include <asm/sections.h> - -#define NMI_VECTOR 0x02 - -/* - * Various low-level irq details needed by irq.c, process.c, - * time.c, io_apic.c and smp.c - * - * Interrupt entry/exit code at both C and assembly level - */ extern void (*const interrupt[NR_IRQS])(void); -#ifdef CONFIG_SMP -void reschedule_interrupt(void); -void invalidate_interrupt(void); -void call_function_interrupt(void); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -void apic_timer_interrupt(void); -void error_interrupt(void); -void spurious_interrupt(void); -void thermal_interrupt(void); -#define platform_legacy_irq(irq) ((irq) < 16) -#endif - -void disable_8259A_irq(unsigned int irq); -void enable_8259A_irq(unsigned int irq); -int i8259A_irq_pending(unsigned int irq); -void make_8259A_irq(unsigned int irq); -void init_8259A(int aeoi); -void send_IPI_self(int vector); -void init_VISWS_APIC_irqs(void); -void setup_IO_APIC(void); -void disable_IO_APIC(void); -void print_IO_APIC(void); -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); -void send_IPI(int dest, int vector); -void setup_ioapic_dest(void); - -extern unsigned long io_apic_irqs; - -extern atomic_t irq_err_count; -extern atomic_t irq_mis_count; -#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) -#endif /* _ASM_HW_IRQ_H */ diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 0062ef3..2867457 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -1,28 +1,3 @@ -#ifndef _ASM_HW_IRQ_H -#define _ASM_HW_IRQ_H - -/* - * linux/include/asm/hw_irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * moved some of the old arch/i386/kernel/irq.h to here. VY - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@informatik.tu-chemnitz.de> - * - * hacked by Andi Kleen for x86-64. - */ - -#ifndef __ASSEMBLY__ -#include <asm/atomic.h> -#include <asm/irq.h> -#include <linux/profile.h> -#include <linux/smp.h> -#include <linux/percpu.h> -#endif - -#define NMI_VECTOR 0x02 /* * IDT vectors usable for external interrupt sources start * at 0x20: @@ -96,25 +71,6 @@ #ifndef __ASSEMBLY__ -/* Interrupt handlers registered during init_IRQ */ -void apic_timer_interrupt(void); -void spurious_interrupt(void); -void error_interrupt(void); -void reschedule_interrupt(void); -void call_function_interrupt(void); -void irq_move_cleanup_interrupt(void); -void invalidate_interrupt0(void); -void invalidate_interrupt1(void); -void invalidate_interrupt2(void); -void invalidate_interrupt3(void); -void invalidate_interrupt4(void); -void invalidate_interrupt5(void); -void invalidate_interrupt6(void); -void invalidate_interrupt7(void); -void thermal_interrupt(void); -void threshold_interrupt(void); -void i8254_timer_resume(void); - typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern void __setup_vector_irq(int cpu); @@ -127,29 +83,9 @@ extern spinlock_t vector_lock; * Interrupt entry/exit code at both C and assembly level */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); -extern void send_IPI_self(int vector); -extern void init_VISWS_APIC_irqs(void); -extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); -extern void disable_IO_APIC(void); -extern void print_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); -extern void send_IPI(int dest, int vector); -extern void setup_ioapic_dest(void); extern void native_init_IRQ(void); -extern unsigned long io_apic_irqs; - -extern atomic_t irq_err_count; -extern atomic_t irq_mis_count; - -#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) - #include <asm/ptrace.h> #define IRQ_NAME2(nr) nr##_interrupt(void) @@ -166,8 +102,4 @@ extern atomic_t irq_mis_count; "push $~(" #nr ") ; " \ "jmp common_interrupt"); -#define platform_legacy_irq(irq) ((irq) < 16) - #endif - -#endif /* _ASM_HW_IRQ_H */ -- cgit v1.1 From 9b7dc567d03d74a1fbae84e88949b6a60d922d82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 20:10:09 +0200 Subject: x86: unify interrupt vector defines The interrupt vector defines are copied 4 times around with minimal differences. Move them all into asm-x86/irq_vectors.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/vmiclock_32.c | 3 +- arch/x86/mach-visws/visws_apic.c | 3 +- include/asm-x86/hw_irq.h | 12 +- include/asm-x86/hw_irq_64.h | 71 --------- include/asm-x86/irq_32.h | 2 +- include/asm-x86/irq_64.h | 29 +--- include/asm-x86/irq_vectors.h | 168 ++++++++++++++++++++++ include/asm-x86/mach-default/irq_vectors.h | 96 ------------- include/asm-x86/mach-default/irq_vectors_limits.h | 16 --- include/asm-x86/mach-visws/irq_vectors.h | 62 -------- include/asm-x86/mach-voyager/irq_vectors.h | 79 ---------- 12 files changed, 184 insertions(+), 359 deletions(-) create mode 100644 include/asm-x86/irq_vectors.h delete mode 100644 include/asm-x86/mach-default/irq_vectors.h delete mode 100644 include/asm-x86/mach-default/irq_vectors_limits.h delete mode 100644 include/asm-x86/mach-visws/irq_vectors.h delete mode 100644 include/asm-x86/mach-voyager/irq_vectors.h diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc..0c7f64b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,7 +51,7 @@ #include <asm/percpu.h> #include <asm/dwarf2.h> #include <asm/processor-flags.h> -#include "irq_vectors.h" +#include <asm/irq_vectors.h> /* * We use macros for low-level operations which need to be overridden diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index a2b0307..ba7d19e 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -33,8 +33,7 @@ #include <asm/apic.h> #include <asm/timer.h> #include <asm/i8253.h> - -#include <irq_vectors.h> +#include <asm/irq_vectors.h> #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c index cef9cb1..d8b2cfd 100644 --- a/arch/x86/mach-visws/visws_apic.c +++ b/arch/x86/mach-visws/visws_apic.c @@ -21,10 +21,9 @@ #include <asm/io.h> #include <asm/apic.h> #include <asm/i8259.h> +#include <asm/irq_vectors.h> #include "cobalt.h" -#include "irq_vectors.h" - static DEFINE_SPINLOCK(cobalt_lock); diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 38af08e..a8c5e8b 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -13,7 +13,7 @@ * unified by tglx */ -#define NMI_VECTOR 0x02 +#include <asm/irq_vectors.h> #ifndef __ASSEMBLY__ @@ -75,6 +75,16 @@ extern void send_IPI(int dest, int vector); extern atomic_t irq_err_count; extern atomic_t irq_mis_count; +/* Voyager functions */ +extern asmlinkage void vic_cpi_interrupt(void); +extern asmlinkage void vic_sys_interrupt(void); +extern asmlinkage void vic_cmn_interrupt(void); +extern asmlinkage void qic_timer_interrupt(void); +extern asmlinkage void qic_invalidate_interrupt(void); +extern asmlinkage void qic_reschedule_interrupt(void); +extern asmlinkage void qic_enable_irq_interrupt(void); +extern asmlinkage void qic_call_function_interrupt(void); + #endif /* !ASSEMBLY_ */ #ifdef CONFIG_X86_32 diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 2867457..98c9d49 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -1,74 +1,3 @@ -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define IA32_SYSCALL_VECTOR 0x80 - - -/* Reserve the lowest usable priority level 0x20 - 0x2f for triggering - * cleanup after irq migration. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - -/* - * Vectors 0x30-0x3f are used for ISA interrupts. - */ -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define RESCHEDULE_VECTOR 0xfd -#define CALL_FUNCTION_VECTOR 0xfc -/* fb free - please don't readd KDB here because it's useless - (hint - think what a NMI bit does to a vector) */ -#define THERMAL_APIC_VECTOR 0xfa -#define THRESHOLD_APIC_VECTOR 0xf9 -/* f8 free */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ - -#define NUM_INVALIDATE_TLB_VECTORS 8 - -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x41 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) -#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */ - - #ifndef __ASSEMBLY__ typedef int vector_irq_t[NR_VECTORS]; diff --git a/include/asm-x86/irq_32.h b/include/asm-x86/irq_32.h index 0b79f31..c5c7542 100644 --- a/include/asm-x86/irq_32.h +++ b/include/asm-x86/irq_32.h @@ -12,7 +12,7 @@ #include <linux/sched.h> /* include comes from machine specific directory */ -#include "irq_vectors.h" +#include <asm/irq_vectors.h> #include <asm/thread_info.h> static inline int irq_canonicalize(int irq) diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h index 7608176..3037ec6 100644 --- a/include/asm-x86/irq_64.h +++ b/include/asm-x86/irq_64.h @@ -11,34 +11,7 @@ */ #include <asm/apicdef.h> - -#define TIMER_IRQ 0 - -/* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) - * Since vectors 0x00-0x1f are used/reserved for the CPU, - * the usable vector space is 0x20-0xff (224 vectors) - */ - -/* - * The maximum number of vectors supported by x86_64 processors - * is limited to 256. For processors other than x86_64, NR_VECTORS - * should be changed accordingly. - */ -#define NR_VECTORS 256 - -#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in hw_irq.h */ - -#if NR_CPUS < MAX_IO_APICS -#define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) -#else -#define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) -#endif -#define NR_IRQ_VECTORS NR_IRQS +#include <asm/irq_vectors.h> static inline int irq_canonicalize(int irq) { diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h new file mode 100644 index 0000000..daceaaf --- /dev/null +++ b/include/asm-x86/irq_vectors.h @@ -0,0 +1,168 @@ +#ifndef _ASM_IRQ_VECTORS_H +#define _ASM_IRQ_VECTORS_H + +#include <linux/threads.h> + +#define NMI_VECTOR 0x02 + +/* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#else +# define IA32_SYSCALL_VECTOR 0x80 +#endif + +/* + * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. + * + * Reserve the lowest usable priority level 0x20 - 0x2f for triggering + * cleanup after irq migration on 64 bit. + */ +#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR + +/* + * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit + */ +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ1_VECTOR (IRQ0_VECTOR + 1) +#define IRQ2_VECTOR (IRQ0_VECTOR + 2) +#define IRQ3_VECTOR (IRQ0_VECTOR + 3) +#define IRQ4_VECTOR (IRQ0_VECTOR + 4) +#define IRQ5_VECTOR (IRQ0_VECTOR + 5) +#define IRQ6_VECTOR (IRQ0_VECTOR + 6) +#define IRQ7_VECTOR (IRQ0_VECTOR + 7) +#define IRQ8_VECTOR (IRQ0_VECTOR + 8) +#define IRQ9_VECTOR (IRQ0_VECTOR + 9) +#define IRQ10_VECTOR (IRQ0_VECTOR + 10) +#define IRQ11_VECTOR (IRQ0_VECTOR + 11) +#define IRQ12_VECTOR (IRQ0_VECTOR + 12) +#define IRQ13_VECTOR (IRQ0_VECTOR + 13) +#define IRQ14_VECTOR (IRQ0_VECTOR + 14) +#define IRQ15_VECTOR (IRQ0_VECTOR + 15) + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + * + * Vectors 0xf0-0xfa are free (reserved for future Linux use). + */ +#ifdef CONFIG_X86_32 + +# define SPURIOUS_APIC_VECTOR 0xff +# define ERROR_APIC_VECTOR 0xfe +# define INVALIDATE_TLB_VECTOR 0xfd +# define RESCHEDULE_VECTOR 0xfc +# define CALL_FUNCTION_VECTOR 0xfb +# define THERMAL_APIC_VECTOR 0xf0 + +#else + +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define RESCHEDULE_VECTOR 0xfd +#define CALL_FUNCTION_VECTOR 0xfc +#define THERMAL_APIC_VECTOR 0xfa +#define THRESHOLD_APIC_VECTOR 0xf9 +#define INVALIDATE_TLB_VECTOR_END 0xf7 +#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ + +#define NUM_INVALIDATE_TLB_VECTORS 8 + +#endif + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority + * levels. (0x80 is the syscall vector) + */ +#ifdef CONFIG_X86_32 +# define FIRST_DEVICE_VECTOR 0x31 +#else +# define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) +#endif + +#define FIRST_SYSTEM_VECTOR 0xef + +#define NR_VECTORS 256 + +#define FPU_IRQ 13 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) + +#if !defined(CONFIG_X86_VISWS) && !defined(CONFIG_X86_VOYAGER) + +# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) + +# define NR_IRQS 224 + +# if (224 >= 32 * NR_CPUS) +# define NR_IRQ_VECTORS NR_IRQS +# else +# define NR_IRQ_VECTORS (32 * NR_CPUS) +# endif + +# else /* IO_APIC || PARAVIRT */ + +# define NR_IRQS 16 +# define NR_IRQ_VECTORS NR_IRQS + +# endif + +#else /* !VISWS && !VOYAGER */ + +# define NR_IRQS 224 +# define NR_IRQ_VECTORS NR_IRQS + +#endif /* VISWS */ + +/* Voyager specific defines */ +/* These define the CPIs we use in linux */ +#define VIC_CPI_LEVEL0 0 +#define VIC_CPI_LEVEL1 1 +/* now the fake CPIs */ +#define VIC_TIMER_CPI 2 +#define VIC_INVALIDATE_CPI 3 +#define VIC_RESCHEDULE_CPI 4 +#define VIC_ENABLE_IRQ_CPI 5 +#define VIC_CALL_FUNCTION_CPI 6 + +/* Now the QIC CPIs: Since we don't need the two initial levels, + * these are 2 less than the VIC CPIs */ +#define QIC_CPI_OFFSET 1 +#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) +#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) +#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) +#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) +#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) + +#define VIC_START_FAKE_CPI VIC_TIMER_CPI +#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI + +/* this is the SYS_INT CPI. */ +#define VIC_SYS_INT 8 +#define VIC_CMN_INT 15 + +/* This is the boot CPI for alternate processors. It gets overwritten + * by the above once the system has activated all available processors */ +#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 +#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) + + +#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-default/irq_vectors.h b/include/asm-x86/mach-default/irq_vectors.h deleted file mode 100644 index 881c63c..0000000 --- a/include/asm-x86/mach-default/irq_vectors.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * This file should contain #defines for all of the interrupt vector - * numbers used by this architecture. - * - * In addition, there are some standard defines: - * - * FIRST_EXTERNAL_VECTOR: - * The first free place for external interrupts - * - * SYSCALL_VECTOR: - * The IRQ vector a syscall makes the user to kernel transition - * under. - * - * TIMER_IRQ: - * The IRQ number the timer interrupt comes in at. - * - * NR_IRQS: - * The total number of interrupt vectors (including all the - * architecture specific interrupts) needed. - * - */ -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - * - * Vectors 0xf0-0xfa are free (reserved for future Linux use). - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define INVALIDATE_TLB_VECTOR 0xfd -#define RESCHEDULE_VECTOR 0xfc -#define CALL_FUNCTION_VECTOR 0xfb - -#define THERMAL_APIC_VECTOR 0xf0 -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef - -#define TIMER_IRQ 0 - -/* - * 16 8259A IRQ's, 208 potential APIC interrupt sources. - * Right now the APIC is mostly only used for SMP. - * 256 vectors is an architectural limit. (we can have - * more than 256 devices theoretically, but they will - * have to use shared interrupts) - * Since vectors 0x00-0x1f are used/reserved for the CPU, - * the usable vector space is 0x20-0xff (224 vectors) - */ - -/* - * The maximum number of vectors supported by i386 processors - * is limited to 256. For processors other than i386, NR_VECTORS - * should be changed accordingly. - */ -#define NR_VECTORS 256 - -#include "irq_vectors_limits.h" - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - - -#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-default/irq_vectors_limits.h b/include/asm-x86/mach-default/irq_vectors_limits.h deleted file mode 100644 index a90c7a6..0000000 --- a/include/asm-x86/mach-default/irq_vectors_limits.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _ASM_IRQ_VECTORS_LIMITS_H -#define _ASM_IRQ_VECTORS_LIMITS_H - -#if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) -#define NR_IRQS 224 -# if (224 >= 32 * NR_CPUS) -# define NR_IRQ_VECTORS NR_IRQS -# else -# define NR_IRQ_VECTORS (32 * NR_CPUS) -# endif -#else -#define NR_IRQS 16 -#define NR_IRQ_VECTORS NR_IRQS -#endif - -#endif /* _ASM_IRQ_VECTORS_LIMITS_H */ diff --git a/include/asm-x86/mach-visws/irq_vectors.h b/include/asm-x86/mach-visws/irq_vectors.h deleted file mode 100644 index cb572d8..0000000 --- a/include/asm-x86/mach-visws/irq_vectors.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* - * Special IRQ vectors used by the SMP architecture, 0xf0-0xff - * - * some of the following vectors are 'rare', they are merged - * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. - * TLB, reschedule and local APIC vectors are performance-critical. - * - * Vectors 0xf0-0xfa are free (reserved for future Linux use). - */ -#define SPURIOUS_APIC_VECTOR 0xff -#define ERROR_APIC_VECTOR 0xfe -#define INVALIDATE_TLB_VECTOR 0xfd -#define RESCHEDULE_VECTOR 0xfc -#define CALL_FUNCTION_VECTOR 0xfb - -#define THERMAL_APIC_VECTOR 0xf0 -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef - -/* - * First APIC vector available to drivers: (vectors 0x30-0xee) - * we start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef - -#define TIMER_IRQ 0 - -/* - * IRQ definitions - */ -#define NR_VECTORS 256 -#define NR_IRQS 224 -#define NR_IRQ_VECTORS NR_IRQS - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - -#endif /* _ASM_IRQ_VECTORS_H */ diff --git a/include/asm-x86/mach-voyager/irq_vectors.h b/include/asm-x86/mach-voyager/irq_vectors.h deleted file mode 100644 index 165421f..0000000 --- a/include/asm-x86/mach-voyager/irq_vectors.h +++ /dev/null @@ -1,79 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* Copyright (C) 2002 - * - * Author: James.Bottomley@HansenPartnership.com - * - * linux/arch/i386/voyager/irq_vectors.h - * - * This file provides definitions for the VIC and QIC CPIs - */ - -#ifndef _ASM_IRQ_VECTORS_H -#define _ASM_IRQ_VECTORS_H - -/* - * IDT vectors usable for external interrupt sources start - * at 0x20: - */ -#define FIRST_EXTERNAL_VECTOR 0x20 - -#define SYSCALL_VECTOR 0x80 - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. - */ - -/* These define the CPIs we use in linux */ -#define VIC_CPI_LEVEL0 0 -#define VIC_CPI_LEVEL1 1 -/* now the fake CPIs */ -#define VIC_TIMER_CPI 2 -#define VIC_INVALIDATE_CPI 3 -#define VIC_RESCHEDULE_CPI 4 -#define VIC_ENABLE_IRQ_CPI 5 -#define VIC_CALL_FUNCTION_CPI 6 - -/* Now the QIC CPIs: Since we don't need the two initial levels, - * these are 2 less than the VIC CPIs */ -#define QIC_CPI_OFFSET 1 -#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) -#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) -#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) -#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) -#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) - -#define VIC_START_FAKE_CPI VIC_TIMER_CPI -#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI - -/* this is the SYS_INT CPI. */ -#define VIC_SYS_INT 8 -#define VIC_CMN_INT 15 - -/* This is the boot CPI for alternate processors. It gets overwritten - * by the above once the system has activated all available processors */ -#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 -#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) - -#define NR_VECTORS 256 -#define NR_IRQS 224 -#define NR_IRQ_VECTORS NR_IRQS - -#define FPU_IRQ 13 - -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 -#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) - -#ifndef __ASSEMBLY__ -extern asmlinkage void vic_cpi_interrupt(void); -extern asmlinkage void vic_sys_interrupt(void); -extern asmlinkage void vic_cmn_interrupt(void); -extern asmlinkage void qic_timer_interrupt(void); -extern asmlinkage void qic_invalidate_interrupt(void); -extern asmlinkage void qic_reschedule_interrupt(void); -extern asmlinkage void qic_enable_irq_interrupt(void); -extern asmlinkage void qic_call_function_interrupt(void); -#endif /* !__ASSEMBLY__ */ - -#endif /* _ASM_IRQ_VECTORS_H */ -- cgit v1.1 From 0bc471d93051a19545257909bc2ed2ad3b389b54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 21:55:12 +0200 Subject: x86: move BUILD_IRQ macro magic to i8259_64.c i8259_64.c is the only place which uses those macros. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/i8259_64.c | 14 ++++++++++++++ include/asm-x86/hw_irq_64.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a15..c4ae476 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -34,6 +34,20 @@ * interrupt-controller happy. */ +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ + asmlinkage void IRQ_NAME(nr); \ + asm("\n.p2align\n" \ + "IRQ" #nr "_interrupt:\n\t" \ + "push $~(" #nr ") ; " \ + "jmp common_interrupt"); + #define BI(x,y) \ BUILD_IRQ(x##y) diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 98c9d49..9305f74 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -17,18 +17,4 @@ extern void native_init_IRQ(void); #include <asm/ptrace.h> -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt"); - #endif -- cgit v1.1 From 97e7b6f54c0d66586a658e985630cd63040311fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 22:02:25 +0200 Subject: x86: unify apic interrupt function declarations Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/hw_irq.h | 4 ++++ include/asm-x86/hw_irq_64.h | 3 --- include/asm-x86/irq_64.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index a8c5e8b..cdb09d7 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -67,6 +67,10 @@ extern void print_IO_APIC(void); extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); extern void setup_ioapic_dest(void); +#ifdef CONFIG_X86_64 +extern void enable_IO_APIC(void); +#endif + /* IPI functions */ extern void send_IPI_self(int vector); extern void send_IPI(int dest, int vector); diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 9305f74..428785b 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -12,9 +12,6 @@ extern spinlock_t vector_lock; * Interrupt entry/exit code at both C and assembly level */ -extern void enable_IO_APIC(void); -extern void native_init_IRQ(void); - #include <asm/ptrace.h> #endif diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h index 3037ec6..1ef233d 100644 --- a/include/asm-x86/irq_64.h +++ b/include/asm-x86/irq_64.h @@ -27,4 +27,6 @@ extern void fixup_irqs(cpumask_t map); #define __ARCH_HAS_DO_SOFTIRQ 1 +extern void native_init_IRQ(void); + #endif /* _ASM_IRQ_H */ -- cgit v1.1 From 22dc12d1f694b9af88e616ab758ff90c69d0fc83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 22:10:39 +0200 Subject: x86: unify hwirq.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/hw_irq.h | 11 +++++++---- include/asm-x86/hw_irq_32.h | 5 ----- include/asm-x86/hw_irq_64.h | 17 ----------------- 3 files changed, 7 insertions(+), 26 deletions(-) delete mode 100644 include/asm-x86/hw_irq_32.h delete mode 100644 include/asm-x86/hw_irq_64.h diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index cdb09d7..1db2dff 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -89,12 +89,15 @@ extern asmlinkage void qic_reschedule_interrupt(void); extern asmlinkage void qic_enable_irq_interrupt(void); extern asmlinkage void qic_call_function_interrupt(void); -#endif /* !ASSEMBLY_ */ - #ifdef CONFIG_X86_32 -# include "hw_irq_32.h" +extern void (*const interrupt[NR_IRQS])(void); #else -# include "hw_irq_64.h" +typedef int vector_irq_t[NR_VECTORS]; +DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void __setup_vector_irq(int cpu); +extern spinlock_t vector_lock; #endif +#endif /* !ASSEMBLY_ */ + #endif diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h deleted file mode 100644 index 89fca5a..0000000 --- a/include/asm-x86/hw_irq_32.h +++ /dev/null @@ -1,5 +0,0 @@ - -extern void (*const interrupt[NR_IRQS])(void); - - - diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h deleted file mode 100644 index 428785b..0000000 --- a/include/asm-x86/hw_irq_64.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __ASSEMBLY__ - -typedef int vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); -extern void __setup_vector_irq(int cpu); -extern spinlock_t vector_lock; - -/* - * Various low-level irq details needed by irq.c, process.c, - * time.c, io_apic.c and smp.c - * - * Interrupt entry/exit code at both C and assembly level - */ - -#include <asm/ptrace.h> - -#endif -- cgit v1.1 From 22067d4501bfb47c8ca34144f68fad734178914d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 22:14:44 +0200 Subject: x86: unify irq.h Not much difference in those files. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/irq.h | 51 +++++++++++++++++++++++++++++++++++++++++++++--- include/asm-x86/irq_32.h | 51 ------------------------------------------------ include/asm-x86/irq_64.h | 32 ------------------------------ 3 files changed, 48 insertions(+), 86 deletions(-) delete mode 100644 include/asm-x86/irq_32.h delete mode 100644 include/asm-x86/irq_64.h diff --git a/include/asm-x86/irq.h b/include/asm-x86/irq.h index 7ba9054..1a29257 100644 --- a/include/asm-x86/irq.h +++ b/include/asm-x86/irq.h @@ -1,5 +1,50 @@ -#ifdef CONFIG_X86_32 -# include "irq_32.h" +#ifndef _ASM_IRQ_H +#define _ASM_IRQ_H +/* + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@informatik.tu-chemnitz.de> + */ + +#include <asm/apicdef.h> +#include <asm/irq_vectors.h> + +static inline int irq_canonicalize(int irq) +{ + return ((irq == 2) ? 9 : irq); +} + +#ifdef CONFIG_X86_LOCAL_APIC +# define ARCH_HAS_NMI_WATCHDOG +#endif + +#ifdef CONFIG_4KSTACKS + extern void irq_ctx_init(int cpu); + extern void irq_ctx_exit(int cpu); +# define __ARCH_HAS_DO_SOFTIRQ #else -# include "irq_64.h" +# define irq_ctx_init(cpu) do { } while (0) +# define irq_ctx_exit(cpu) do { } while (0) +# ifdef CONFIG_X86_64 +# define __ARCH_HAS_DO_SOFTIRQ +# endif +#endif + +#ifdef CONFIG_IRQBALANCE +extern int irqbalance_disable(char *str); +#endif + +#ifdef CONFIG_HOTPLUG_CPU +#include <linux/cpumask.h> +extern void fixup_irqs(cpumask_t map); #endif + +extern unsigned int do_IRQ(struct pt_regs *regs); +extern void init_IRQ(void); +extern void native_init_IRQ(void); + +/* Interrupt vector management */ +extern DECLARE_BITMAP(used_vectors, NR_VECTORS); + +#endif /* _ASM_IRQ_H */ diff --git a/include/asm-x86/irq_32.h b/include/asm-x86/irq_32.h deleted file mode 100644 index c5c7542..0000000 --- a/include/asm-x86/irq_32.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef _ASM_IRQ_H -#define _ASM_IRQ_H - -/* - * linux/include/asm/irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@informatik.tu-chemnitz.de> - */ - -#include <linux/sched.h> -/* include comes from machine specific directory */ -#include <asm/irq_vectors.h> -#include <asm/thread_info.h> - -static inline int irq_canonicalize(int irq) -{ - return ((irq == 2) ? 9 : irq); -} - -#ifdef CONFIG_X86_LOCAL_APIC -# define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ -#endif - -#ifdef CONFIG_4KSTACKS - extern void irq_ctx_init(int cpu); - extern void irq_ctx_exit(int cpu); -# define __ARCH_HAS_DO_SOFTIRQ -#else -# define irq_ctx_init(cpu) do { } while (0) -# define irq_ctx_exit(cpu) do { } while (0) -#endif - -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - -#ifdef CONFIG_HOTPLUG_CPU -extern void fixup_irqs(cpumask_t map); -#endif - -unsigned int do_IRQ(struct pt_regs *regs); -void init_IRQ(void); -void __init native_init_IRQ(void); - -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); - -#endif /* _ASM_IRQ_H */ diff --git a/include/asm-x86/irq_64.h b/include/asm-x86/irq_64.h deleted file mode 100644 index 1ef233d..0000000 --- a/include/asm-x86/irq_64.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _ASM_IRQ_H -#define _ASM_IRQ_H - -/* - * linux/include/asm/irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@informatik.tu-chemnitz.de> - */ - -#include <asm/apicdef.h> -#include <asm/irq_vectors.h> - -static inline int irq_canonicalize(int irq) -{ - return ((irq == 2) ? 9 : irq); -} - -#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ - -#ifdef CONFIG_HOTPLUG_CPU -#include <linux/cpumask.h> -extern void fixup_irqs(cpumask_t map); -#endif - -#define __ARCH_HAS_DO_SOFTIRQ 1 - -extern void native_init_IRQ(void); - -#endif /* _ASM_IRQ_H */ -- cgit v1.1 From 88a83350bc1955fa9d8fca059e19bc360fcef2ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 23:19:26 +0200 Subject: x86: declare setup_apic_routing Global functions need a prototype. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/genapic_64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h index 1de931b..0f85046 100644 --- a/include/asm-x86/genapic_64.h +++ b/include/asm-x86/genapic_64.h @@ -44,4 +44,6 @@ DECLARE_PER_CPU(int, x2apic_extra_bits); extern void uv_cpu_init(void); extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip); +extern void setup_apic_routing(void); + #endif -- cgit v1.1 From 1a331957efd214fc3a84f70956dfaec65e70c031 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 3 May 2008 00:30:50 +0200 Subject: x86: move eisa_set_level_irq declaration to header Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 2 -- include/asm-x86/hw_irq.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc..671591f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -507,8 +507,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - extern void eisa_set_level_irq(unsigned int irq); - if (triggering == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 1db2dff..1428b41 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -79,6 +79,9 @@ extern void send_IPI(int dest, int vector); extern atomic_t irq_err_count; extern atomic_t irq_mis_count; +/* EISA */ +extern void eisa_set_level_irq(unsigned int irq); + /* Voyager functions */ extern asmlinkage void vic_cpi_interrupt(void); extern asmlinkage void vic_sys_interrupt(void); -- cgit v1.1 From 305b92a2323eeaa4b481f409d54f778dd7e21a46 Mon Sep 17 00:00:00 2001 From: Alan Mayer <ajm@sgi.com> Date: Tue, 15 Apr 2008 15:36:56 -0500 Subject: x86: change FIRST_SYSTEM_VECTOR to a variable The SGI UV system needs several more system vectors than a vanilla x86_64 system. Rather than burden the other archs with extra system vectors that they don't use, change FIRST_SYSTEM_VECTOR to a variable, so that it can be dynamic. Signed-off-by: Alan Mayer <ajm@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic_32.c | 14 +++++++------- arch/x86/kernel/i8259_64.c | 30 +++++++++++++++--------------- arch/x86/kernel/io_apic_32.c | 8 ++++++-- arch/x86/kernel/io_apic_64.c | 6 +++++- include/asm-x86/desc.h | 22 ++++++++++++++++++++++ include/asm-x86/irq_vectors.h | 2 -- 6 files changed, 55 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..807158e 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1351,13 +1351,13 @@ void __init smp_intr_init(void) * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } #endif @@ -1370,15 +1370,15 @@ void __init apic_intr_init(void) smp_intr_init(); #endif /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* thermal monitor LVT interrupt */ #ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif } diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index c4ae476..1870e0e 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -493,33 +493,33 @@ void __init native_init_IRQ(void) * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54f..0ae4a9d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -83,6 +83,10 @@ int mp_irq_entries; static int disable_timer_pin_1 __initdata; +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -1176,7 +1180,7 @@ static int __assign_irq_vector(int irq) offset = current_offset; next: vector += 8; - if (vector >= FIRST_SYSTEM_VECTOR) { + if (vector >= first_system_vector) { offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } @@ -2269,7 +2273,7 @@ void __init setup_IO_APIC(void) int i; /* Reserve all the system vectors. */ - for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) + for (i = first_system_vector; i < NR_VECTORS; i++) set_bit(i, used_vectors); enable_IO_APIC(); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8df..f1e1ae3 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { static int assign_irq_vector(int irq, cpumask_t mask); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + #define __apicdebuginit __init int sis_apic_bug; /* not actually supported, dummy for compile */ @@ -730,7 +734,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) offset = current_offset; next: vector += 8; - if (vector >= FIRST_SYSTEM_VECTOR) { + if (vector >= first_system_vector) { /* If we run out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index 268a012..b3875d4 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -311,6 +311,28 @@ static inline void set_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } +#define SYS_VECTOR_FREE 0 +#define SYS_VECTOR_ALLOCED 1 + +extern int first_system_vector; +extern char system_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (system_vectors[vector] == SYS_VECTOR_FREE) { + system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (first_system_vector > vector) + first_system_vector = vector; + } else + BUG(); +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + /* * This routine sets up an interrupt gate at directory privilege level 3. */ diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index daceaaf..3cb6d8c 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -96,8 +96,6 @@ # define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) #endif -#define FIRST_SYSTEM_VECTOR 0xef - #define NR_VECTORS 256 #define FPU_IRQ 13 -- cgit v1.1 From ce17833183bf0a08ce3d174a2088eff0a06f2080 Mon Sep 17 00:00:00 2001 From: Alan Mayer <ajm@sgi.com> Date: Wed, 16 Apr 2008 15:17:20 -0500 Subject: x86: change FIRST_SYSTEM_VECTOR to a variable, fix Fixes the build error introduced by my FIRST_SYSTEM_VECTOR patch Signed-off-by: Alan Mayer <ajm@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apic_32.c | 4 ++++ arch/x86/kernel/io_apic_32.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 807158e..d5767cb 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -71,6 +71,10 @@ int local_apic_timer_disabled; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Debug level, exported for io_apic.c */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 0ae4a9d..76769cc 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -83,10 +83,6 @@ int mp_irq_entries; static int disable_timer_pin_1 __initdata; -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. -- cgit v1.1 From 04b361abfdc522239e3a071f3afdebf5787d9f03 Mon Sep 17 00:00:00 2001 From: Andi Kleen <andi@firstfloor.org> Date: Mon, 5 May 2008 12:36:38 +0200 Subject: i386: Execute stack overflow warning on interrupt stack v2 Previously the reporting printk would run on the process stack, which risks overflow an already low stack. Instead execute it on the interrupt stack. This makes it more likely for the printk to make it actually out. It adds one not taken test/branch more to the interrupt path when stack overflow checking is enabled. We could avoid that by duplicating more code, but that seemed not worth it. Based on an observation by Eric Sandeen. v2: Fix warnings in some configs Signed-off-by: Andi Kleen <andi@firstfloor.org> Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/irq_32.c | 49 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352d..3f76561 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -61,6 +61,26 @@ static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; #endif +static void stack_overflow(void) +{ + printk("low stack detected by irq handler\n"); + dump_stack(); +} + +static inline void call_on_stack2(void *func, void *stack, + unsigned long arg1, unsigned long arg2) +{ + unsigned long bx; + asm volatile( + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) + : "0" (arg1), "1" (arg2), "2" (stack), + "D" (func) + : "memory", "cc", "ecx"); +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific @@ -76,6 +96,7 @@ unsigned int do_IRQ(struct pt_regs *regs) union irq_ctx *curctx, *irqctx; u32 *isp; #endif + int overflow = 0; if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -92,11 +113,8 @@ unsigned int do_IRQ(struct pt_regs *regs) __asm__ __volatile__("andl %%esp,%0" : "=r" (sp) : "0" (THREAD_SIZE - 1)); - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", - sp - sizeof(struct thread_info)); - dump_stack(); - } + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) + overflow = 1; } #endif @@ -112,8 +130,6 @@ unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, bx; - /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); irqctx->tinfo.task = curctx->tinfo.task; @@ -127,18 +143,19 @@ unsigned int do_IRQ(struct pt_regs *regs) (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | (curctx->tinfo.preempt_count & SOFTIRQ_MASK); - asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (bx) - : "0" (irq), "1" (desc), "2" (isp), - "D" (desc->handle_irq) - : "memory", "cc", "ecx" - ); + /* Execute warning on interrupt stack */ + if (unlikely(overflow)) + call_on_stack2(stack_overflow, isp, 0, 0); + + call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc); } else #endif + { + /* AK: Slightly bogus here */ + if (overflow) + stack_overflow(); desc->handle_irq(irq, desc); + } irq_exit(); set_irq_regs(old_regs); -- cgit v1.1 From de9b10af1287bf25b9c0433de53a2e95ef611aa7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 5 May 2008 15:58:15 +0200 Subject: x86: janitor stack overflow warning patch Add KERN_WARNING to the printk as this could not be done in the original patch, which allegedly only moves code around. Un#ifdef do_IRQ. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/irq_32.c | 136 ++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 61 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3f76561..1c470d2 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq) #endif } +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* Debugging check for stack overflow: is there less than 1KB free? */ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + #ifdef CONFIG_4KSTACKS /* * per-CPU IRQ handling contexts (thread information and stack) @@ -59,18 +82,12 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -#endif - -static void stack_overflow(void) -{ - printk("low stack detected by irq handler\n"); - dump_stack(); -} -static inline void call_on_stack2(void *func, void *stack, - unsigned long arg1, unsigned long arg2) +static inline void call_on_stack(void *func, void *stack, + unsigned long arg1, void *arg2) { unsigned long bx; + asm volatile( " xchgl %%ebx,%%esp \n" " call *%%edi \n" @@ -81,22 +98,61 @@ static inline void call_on_stack2(void *func, void *stack, : "memory", "cc", "ecx"); } +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +{ + union irq_ctx *curctx, *irqctx; + u32 *isp; + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curctx == irqctx)) + return 0; + + /* build the stack frame on the IRQ stack */ + isp = (u32 *) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. + */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp, 0, NULL); + + call_on_stack(desc->handle_irq, isp, irq, desc); + + return 1; +} + +#else +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } +#endif + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ unsigned int do_IRQ(struct pt_regs *regs) -{ +{ struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_ax; + int overflow, irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; -#ifdef CONFIG_4KSTACKS - union irq_ctx *curctx, *irqctx; - u32 *isp; -#endif - int overflow = 0; if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -106,54 +162,12 @@ unsigned int do_IRQ(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_DEBUG_STACKOVERFLOW - /* Debugging check for stack overflow: is there less than 1KB free? */ - { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); - if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) - overflow = 1; - } -#endif - -#ifdef CONFIG_4KSTACKS - curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + overflow = check_stack_overflow(); - /* - * this is where we switch to the IRQ stack. However, if we are - * already using the IRQ stack (because we interrupted a hardirq - * handler) we can't do that and just have to keep using the - * current stack (which is the irq stack already after all) - */ - if (curctx != irqctx) { - /* build the stack frame on the IRQ stack */ - isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); - irqctx->tinfo.task = curctx->tinfo.task; - irqctx->tinfo.previous_esp = current_stack_pointer; - - /* - * Copy the softirq bits in preempt_count so that the - * softirq checks work in the hardirq context. - */ - irqctx->tinfo.preempt_count = - (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | - (curctx->tinfo.preempt_count & SOFTIRQ_MASK); - - /* Execute warning on interrupt stack */ + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) - call_on_stack2(stack_overflow, isp, 0, 0); - - call_on_stack2(desc->handle_irq, isp, irq, (unsigned long)desc); - } else -#endif - { - /* AK: Slightly bogus here */ - if (overflow) - stack_overflow(); + print_stack_overflow(); desc->handle_irq(irq, desc); } -- cgit v1.1 From 403d8efc94cd02ae36e7db13c4edf1d06d7b7fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 5 May 2008 18:13:50 +0200 Subject: x86: irq_32 move 4kstacks code to one place Move the 4KSTACKS related code to one place. This allows to un#ifdef do_IRQ() and share the executed on stack for the stack overflow printk and the softirq call. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/irq_32.c | 163 ++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 86 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1c470d2..4e3e8ec 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,26 +83,28 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static inline void call_on_stack(void *func, void *stack, - unsigned long arg1, void *arg2) +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +static void call_on_stack(void *func, void *stack) { - unsigned long bx; - - asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (bx) - : "0" (arg1), "1" (arg2), "2" (stack), - "D" (func) - : "memory", "cc", "ecx"); + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { union irq_ctx *curctx, *irqctx; - u32 *isp; + u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); irqctx = hardirq_ctx[smp_processor_id()]; @@ -130,64 +132,22 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); if (unlikely(overflow)) - call_on_stack(print_stack_overflow, isp, 0, NULL); - - call_on_stack(desc->handle_irq, isp, irq, desc); - - return 1; -} - -#else -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } -#endif - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -unsigned int do_IRQ(struct pt_regs *regs) -{ - struct pt_regs *old_regs; - /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; - - if (unlikely((unsigned)irq >= NR_IRQS)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } - - old_regs = set_irq_regs(regs); - irq_enter(); - - overflow = check_stack_overflow(); - - if (!execute_on_irq_stack(overflow, desc, irq)) { - if (unlikely(overflow)) - print_stack_overflow(); - desc->handle_irq(irq, desc); - } - - irq_exit(); - set_irq_regs(old_regs); + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (isp) + : "0" (irq), "1" (desc), "2" (isp), + "D" (desc->handle_irq) + : "memory", "cc", "ecx"); return 1; } -#ifdef CONFIG_4KSTACKS - -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - /* * allocate per-cpu stacks for hardirq and for softirq processing */ -void irq_ctx_init(int cpu) +void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; @@ -195,25 +155,25 @@ void irq_ctx_init(int cpu) return; irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); hardirq_ctx[cpu] = irqctx; irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; - irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = 0; - irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); softirq_ctx[cpu] = irqctx; - printk("CPU %u irqstacks, hard=%p soft=%p\n", - cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); } void irq_ctx_exit(int cpu) @@ -242,25 +202,56 @@ asmlinkage void do_softirq(void) /* build the stack frame on the softirq stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); - asm volatile( - " xchgl %%ebx,%%esp \n" - " call __do_softirq \n" - " movl %%ebx,%%esp \n" - : "=b"(isp) - : "0"(isp) - : "memory", "cc", "edx", "ecx", "eax" - ); + call_on_stack(__do_softirq, isp); /* * Shouldnt happen, we returned above if in_interrupt(): - */ + */ WARN_ON_ONCE(softirq_count()); } local_irq_restore(flags); } + +#else +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } #endif /* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + /* high bit used in ret_from_ code */ + int overflow, irq = ~regs->orig_ax; + struct irq_desc *desc = irq_desc + irq; + + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __func__, irq); + BUG(); + } + + old_regs = set_irq_regs(regs); + irq_enter(); + + overflow = check_stack_overflow(); + + if (!execute_on_irq_stack(overflow, desc, irq)) { + if (unlikely(overflow)) + print_stack_overflow(); + desc->handle_irq(irq, desc); + } + + irq_exit(); + set_irq_regs(old_regs); + return 1; +} + +/* * Interrupt statistics: */ -- cgit v1.1 From 988f7b5789ccf5cfed14c72e28573a49f0cb4809 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com> Date: Tue, 18 Mar 2008 17:00:22 -0700 Subject: x86: PAT export resource_wc in pci sysfs For the ranges with IORESOURCE_PREFETCH, export a new resource_wc interface in pci /sysfs along with resource (which is uncached). Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- Documentation/filesystems/sysfs-pci.txt | 1 + drivers/pci/pci-sysfs.c | 84 ++++++++++++++++++++++++--------- include/linux/pci.h | 1 + 3 files changed, 64 insertions(+), 22 deletions(-) diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 5daa2aa..68ef488 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -36,6 +36,7 @@ files, each with their own function. local_cpus nearby CPU mask (cpumask, ro) resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap) + resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) subsystem_vendor PCI subsystem vendor (ascii, ro) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 271d41c..9ec7d39 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -489,13 +489,14 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap + * @write_combine: 1 for write_combine mapping * * Use the regular PCI mapping routines to map a PCI resource into userspace. * FIXME: write combining? maybe automatic for prefetchable regions? */ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, int write_combine) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj)); @@ -518,7 +519,21 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; - return pci_mmap_page_range(pdev, vma, mmap_type, 0); + return pci_mmap_page_range(pdev, vma, mmap_type, write_combine); +} + +static int +pci_mmap_resource_uc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 0); +} + +static int +pci_mmap_resource_wc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 1); } /** @@ -541,9 +556,46 @@ pci_remove_resource_files(struct pci_dev *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); kfree(res_attr); } + + res_attr = pdev->res_attr_wc[i]; + if (res_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); + kfree(res_attr); + } } } +static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) +{ + /* allocate attribute structure, piggyback attribute name */ + int name_len = write_combine ? 13 : 10; + struct bin_attribute *res_attr; + int retval; + + res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC); + if (res_attr) { + char *res_attr_name = (char *)(res_attr + 1); + + if (write_combine) { + pdev->res_attr_wc[num] = res_attr; + sprintf(res_attr_name, "resource%d_wc", num); + res_attr->mmap = pci_mmap_resource_wc; + } else { + pdev->res_attr[num] = res_attr; + sprintf(res_attr_name, "resource%d", num); + res_attr->mmap = pci_mmap_resource_uc; + } + res_attr->attr.name = res_attr_name; + res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->size = pci_resource_len(pdev, num); + res_attr->private = &pdev->resource[num]; + retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); + } else + retval = -ENOMEM; + + return retval; +} + /** * pci_create_resource_files - create resource files in sysfs for @dev * @dev: dev in question @@ -557,31 +609,19 @@ static int pci_create_resource_files(struct pci_dev *pdev) /* Expose the PCI resources from this device as files */ for (i = 0; i < PCI_ROM_RESOURCE; i++) { - struct bin_attribute *res_attr; /* skip empty resources */ if (!pci_resource_len(pdev, i)) continue; - /* allocate attribute structure, piggyback attribute name */ - res_attr = kzalloc(sizeof(*res_attr) + 10, GFP_ATOMIC); - if (res_attr) { - char *res_attr_name = (char *)(res_attr + 1); - - pdev->res_attr[i] = res_attr; - sprintf(res_attr_name, "resource%d", i); - res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; - res_attr->size = pci_resource_len(pdev, i); - res_attr->mmap = pci_mmap_resource; - res_attr->private = &pdev->resource[i]; - retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); - if (retval) { - pci_remove_resource_files(pdev); - return retval; - } - } else { - return -ENOMEM; + retval = pci_create_attr(pdev, i, 0); + /* for prefetchable resources, create a WC mappable file */ + if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH) + retval = pci_create_attr(pdev, i, 1); + + if (retval) { + pci_remove_resource_files(pdev); + return retval; } } return 0; diff --git a/include/linux/pci.h b/include/linux/pci.h index 509159b..d18b1dd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -206,6 +206,7 @@ struct pci_dev { struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ int rom_attr_enabled; /* has display of the rom attribute been enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ + struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ #ifdef CONFIG_PCI_MSI struct list_head msi_list; #endif -- cgit v1.1 From 77b52b4c5c66175553ee59eb43f74366f1e54bde Mon Sep 17 00:00:00 2001 From: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Mon, 5 May 2008 19:09:10 -0700 Subject: x86: add "debugpat" boot option enable debug messages by a boot option "debugpat". Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/mm/pat.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 60adbe2..1a82f4d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -42,6 +42,19 @@ static int nopat(char *str) early_param("nopat", nopat); #endif + +static int debug_enable; +static int __init pat_debug_setup(char *str) +{ + debug_enable = 1; + return 0; +} +__setup("debugpat", pat_debug_setup); + +#define dprintk(fmt, arg...) \ + do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + + static u64 __read_mostly boot_pat_state; enum { @@ -279,7 +292,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { - pr_debug("New Entry\n"); + dprintk("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -329,7 +342,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - pr_debug("Overlap at 0x%Lx-0x%Lx\n", + dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, saved_ptr->nd.prev); @@ -381,7 +394,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", + dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); @@ -403,16 +416,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); - pr_debug("New Entry\n"); + dprintk("New Entry\n"); } if (ret_type) { - pr_debug( + dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), cattr_name(req_type), cattr_name(*ret_type)); } else { - pr_debug( + dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(actual_type), cattr_name(req_type)); @@ -453,7 +466,7 @@ int free_memtype(u64 start, u64 end) current->comm, current->pid, start, end); } - pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } -- cgit v1.1 From f8096f92b87d81c55ed63964d27baa9ce5ffe508 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Tue, 22 Apr 2008 16:27:29 +0100 Subject: x86: separate cmpxchg8b checking from PAE checking .. allowing the former to be use in non-PAE kernels, too. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig.cpu | 4 ++++ include/asm-x86/required-features.h | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2ad6301..3d22bb8 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -399,6 +399,10 @@ config X86_TSC def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 +config X86_CMPXCHG64 + def_bool y + depends on X86_PAE || X86_64 + # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV diff --git a/include/asm-x86/required-features.h b/include/asm-x86/required-features.h index 7400d3a..8c38719 100644 --- a/include/asm-x86/required-features.h +++ b/include/asm-x86/required-features.h @@ -19,9 +19,13 @@ #if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) # define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) #else # define NEED_PAE 0 +#endif + +#ifdef CONFIG_X86_CMPXCHG64 +# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) +#else # define NEED_CX8 0 #endif -- cgit v1.1 From aa134f1b09df6beaa4d031a50d5fda1f3cebce6c Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@ucw.cz> Date: Tue, 8 Apr 2008 10:49:03 +0200 Subject: x86: iommu: use symbolic constants, not hardcoded numbers Move symbolic constants into gart.h, and use them instead of hardcoded constant. Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- drivers/char/agp/amd64-agp.c | 25 +++---------------------- include/asm-x86/gart.h | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d..bffcf45 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -598,13 +598,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = k8_northbridges[i]; gatt_reg = __pa(gatt) >> 12; gatt_reg <<= 4; - pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= 1; - ctl &= ~((1<<4) | (1<<5)); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } flush_gart(); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index d8200ac..25d6422 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -16,28 +16,9 @@ #include <asm/page.h> /* PAGE_SIZE */ #include <asm/e820.h> #include <asm/k8.h> +#include <asm/gart.h> #include "agp.h" -/* PTE bits. */ -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 - -/* Aperture control register bits. */ -#define GARTEN (1<<0) -#define DISGARTCPU (1<<4) -#define DISGARTIO (1<<5) - -/* GART cache control register bits. */ -#define INVGART (1<<0) -#define GARTPTEERR (1<<1) - -/* K8 On-cpu GART registers */ -#define AMD64_GARTAPERTURECTL 0x90 -#define AMD64_GARTAPERTUREBASE 0x94 -#define AMD64_GARTTABLEBASE 0x98 -#define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) - /* NVIDIA K8 registers */ #define NVIDIA_X86_64_0_APBASE 0x10 #define NVIDIA_X86_64_1_APBASE1 0x50 @@ -165,7 +146,7 @@ static int amd64_fetch_size(void) * In a multiprocessor x86-64 system, this function gets * called once for each CPU. */ -static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table) +static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table) { u64 aperturebase; u32 tmp; @@ -181,7 +162,7 @@ static u64 amd64_configure (struct pci_dev *hammer, u64 gatt_table) addr >>= 12; tmp = (u32) addr<<4; tmp &= ~0xf; - pci_write_config_dword (hammer, AMD64_GARTTABLEBASE, tmp); + pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp); /* Enable GART translation for this hammer. */ pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp); diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 90958ed..248e577 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -5,6 +5,7 @@ extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; extern int iommu_detected; +extern int agp_amd64_init(void); #ifdef CONFIG_GART_IOMMU extern void gart_iommu_init(void); extern void gart_iommu_shutdown(void); @@ -31,4 +32,24 @@ static inline void gart_iommu_shutdown(void) #endif +/* PTE bits. */ +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 + +/* Aperture control register bits. */ +#define GARTEN (1<<0) +#define DISGARTCPU (1<<4) +#define DISGARTIO (1<<5) + +/* GART cache control register bits. */ +#define INVGART (1<<0) +#define GARTPTEERR (1<<1) + +/* K8 On-cpu GART registers */ +#define AMD64_GARTAPERTURECTL 0x90 +#define AMD64_GARTAPERTUREBASE 0x94 +#define AMD64_GARTTABLEBASE 0x98 +#define AMD64_GARTCACHECTL 0x9c +#define AMD64_GARTEN (1<<0) + #endif -- cgit v1.1 From 1edc1ab3f68168ec6815e6d630f38948a6da005a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Sun, 13 Apr 2008 01:11:41 -0700 Subject: x86: agp_gart size checking for buggy device while looking at Rafael J. Wysocki's system boot log, I found a funny printout: Node 0: aperture @ de000000 size 32 MB Aperture too small (32 MB) AGP bridge at 00:04:00 Aperture from AGP @ de000000 size 4096 MB (APSIZE 0) Aperture too small (0 MB) Your BIOS doesn't leave a aperture memory hole Please enable the IOMMU option in the BIOS setup This costs you 64 MB of RAM Mapping aperture over 65536 KB of RAM @ 4000000 ... agpgart: Detected AGP bridge 20 agpgart: Aperture pointing to RAM agpgart: Aperture from AGP @ de000000 size 4096 MB agpgart: Aperture too small (0 MB) agpgart: No usable aperture found. agpgart: Consider rebooting with iommu=memaper=2 to get a good aperture. it means BIOS allocated the correct gart on the NB and AGP bridge, but because a bug in the silicon (the agp bridge reports the wrong order, it wants 4G instead) the kernel will reject that allocation. Also, because the size is only 32MB, and we try to get another 64M for gart, late fix_northbridge can not revert that change because it still reads the wrong size from agp bridge. So try to double check the order value from the agp bridge, before calling aperture_valid(). [ mingo@elte.hu: 32-bit fix. ] Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 14 ++++++++++++++ drivers/char/agp/amd64-agp.c | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 479926d..2e93b31 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -138,6 +138,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) int nbits; u32 aper_low, aper_hi; u64 aper; + u32 old_order; printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); @@ -146,6 +147,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) return 0; } + /* old_order could be the value from NB gart setting */ + old_order = *order; + apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ if (apsize & 0xff) @@ -159,6 +163,16 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + /* + * On some sick chips, APSIZE is 0. It means it wants 4G + * so let double check that order, and lets trust AMD NB settings: + */ + if (aper + (32UL<<(20 + *order)) > 0x100000000UL) { + printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", + 32 << *order, apsizereg); + *order = old_order; + } + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 25d6422..9e3939d 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -312,6 +312,17 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, pci_read_config_dword(agp, 0x10, &aper_low); pci_read_config_dword(agp, 0x14, &aper_hi); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + /* + * On some sick chips APSIZE is 0. This means it wants 4G + * so let double check that order, and lets trust the AMD NB settings + */ + if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) { + printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n", + 32 << order); + order = nb_order; + } + printk(KERN_INFO PFX "Aperture from AGP @ %Lx size %u MB\n", aper, 32 << order); if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order)) return -1; -- cgit v1.1 From 8c9fd91a0dc503f085169d44f4360be025f75224 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 13 Apr 2008 18:42:31 -0700 Subject: x86: checking aperture size order some systems are using 32M for gart and agp when memory is less than 4G. Kernel will reject and try to allcate another 64M that is not needed, and we will waste 64M of perfectly good RAM. this patch adds a workaround by checking aper_base/order between NB and agp bridge. If they are the same, and memory size is less than 4G, it will allow it. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 47 ++++++++++++++++++++++++++++++++----------- drivers/char/agp/amd64-agp.c | 12 +++++------ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 2e93b31..8c325b7f 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -83,7 +83,7 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(u64 aper_base, u32 aper_size) +static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) { if (!aper_base) return 0; @@ -96,8 +96,9 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size) printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); return 0; } - if (aper_size < 64*1024*1024) { - printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); + if (aper_size < min_size) { + printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", + aper_size>>20, min_size>>20); return 0; } @@ -167,7 +168,9 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) * On some sick chips, APSIZE is 0. It means it wants 4G * so let double check that order, and lets trust AMD NB settings: */ - if (aper + (32UL<<(20 + *order)) > 0x100000000UL) { + printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", + aper, 32 << old_order); + if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", 32 << *order, apsizereg); *order = old_order; @@ -176,7 +179,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); - if (!aperture_valid(aper, (32*1024*1024) << *order)) + if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) return 0; return (u32)aper; } @@ -302,8 +305,8 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (e820_any_mapped(aper_base, aper_base + aper_size, - E820_RAM)) { + if (!e820_all_mapped(aper_base, aper_base + aper_size, + E820_RESERVED)) { /* reserved it, so we can resuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); @@ -324,8 +327,11 @@ void __init early_gart_iommu_check(void) } +static int __initdata printed_gart_size_msg; + void __init gart_iommu_hole_init(void) { + u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; int fix, num, valid_agp = 0; @@ -337,6 +343,9 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); + if (!fallback_aper_force) + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + fix = 0; node = 0; for (num = 24; num < 32; num++) { @@ -355,9 +364,21 @@ void __init gart_iommu_hole_init(void) node, aper_base, aper_size >> 20); node++; - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { + printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); + printk(KERN_ERR "please increase GART size in your BIOS setup\n"); + printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + break; + } } if ((last_aper_order && aper_order != last_aper_order) || @@ -378,8 +399,10 @@ void __init gart_iommu_hole_init(void) return; } - if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + if (!fallback_aper_force) { + aper_alloc = agp_aper_base; + aper_order = agp_aper_order; + } if (aper_alloc) { /* Got the aperture from the AGP bridge */ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 9e3939d..9c24470 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -245,11 +245,7 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "No aperture\n"); return 0; } - if (size < 32*1024*1024) { - printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); - return 0; - } - if ((u64)aper + size > 0x100000000ULL) { + if ((u64)aper + size > 0x100000000ULL) { printk(KERN_ERR PFX "Aperture out of bounds\n"); return 0; } @@ -257,6 +253,10 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "Aperture pointing to RAM\n"); return 0; } + if (size < 32*1024*1024) { + printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); + return 0; + } /* Request the Aperture. This catches cases when someone else already put a mapping in there - happens with some very broken BIOS @@ -317,7 +317,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, * On some sick chips APSIZE is 0. This means it wants 4G * so let double check that order, and lets trust the AMD NB settings */ - if (aper + (32ULL<<(20 + order)) > 0x100000000ULL) { + if (order >=0 && aper + (32ULL<<(20 + order)) > 0x100000000ULL) { printk(KERN_INFO "Aperture size %u MB is not right, using settings from NB\n", 32 << order); order = nb_order; -- cgit v1.1 From 7677b2ef6c0c4fddc84f6473f3863f40eb71821b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Mon, 14 Apr 2008 20:40:37 -0700 Subject: x86_64: allocate gart aperture from 512M because we try to reserve dma32 early, so we have chance to get aperture from 64M. with some sequence aperture allocated from RAM, could become E820_RESERVED. and then if doing a kexec with a big kernel that uncompressed size is above 64M we could have a range conflict with still using gart. So allocate gart aperture from 512M instead. Also change the fallback_aper_order to 5, because we don't have chance to get 2G or 4G aperture. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 20 +++++++++++++++++--- arch/x86/kernel/pci-dma.c | 6 +++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 8c325b7f..c63f8d9 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -55,8 +55,9 @@ static u32 __init allocate_aperture(void) u32 aper_size; void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; + /* aper_size should <= 1G */ + if (fallback_aper_order > 5) + fallback_aper_order = 5; aper_size = (32 * 1024 * 1024) << fallback_aper_order; /* @@ -65,7 +66,20 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); + /* + * using 512M as goal, in case kexec will load kernel_big + * that will do the on position decompress, and could overlap with + * that positon with gart that is used. + * sequende: + * kernel_small + * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) + * ==> kernel_small(gart area become e820_reserved) + * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * so don't use 512M below as gart iommu, leave the space for kernel + * code for safe + */ + p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0c37f16..1a017f0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -77,10 +77,14 @@ void __init dma32_reserve_bootmem(void) if (end_pfn <= MAX_DMA32_PFN) return; + /* + * check aperture_64.c allocate_aperture() for reason about + * using 512M as goal + */ align = 64ULL<<20; size = round_up(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, - __pa(MAX_DMA_ADDRESS)); + 512ULL<<20); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else -- cgit v1.1 From 55c0d721df80dcc505dc888e85d4ca51ea150ce9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Sat, 19 Apr 2008 01:31:11 -0700 Subject: x86: clean up aperture_64.c 1. use symbolic register names where appropriate. 2. num to bus or slot changing 3. handle for new opteron for bus other than 0 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 230 +++++++++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 91 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index c63f8d9..02f4dba 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -35,6 +35,18 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; +struct bus_dev_range { + int bus; + int dev_base; + int dev_limit; +}; + +static struct bus_dev_range bus_dev_ranges[] __initdata = { + { 0x00, 0x18, 0x20}, + { 0xff, 0x00, 0x20}, + { 0xfe, 0x00, 0x20} +}; + static struct resource gart_resource = { .name = "GART", .flags = IORESOURCE_MEM, @@ -120,33 +132,33 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) } /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) +static __u32 __init find_cap(int bus, int slot, int func, int cap) { int bytes; u8 pos; - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; if (id == cap) return pos; - pos = read_pci_config_byte(num, slot, func, + pos = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_NEXT); } return 0; } /* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) { u32 apsize; u32 apsizereg; @@ -155,8 +167,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) u64 aper; u32 old_order; - printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); + apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; @@ -174,8 +186,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) if ((int)*order < 0) /* < 32MB */ *order = 0; - aper_low = read_pci_config(num, slot, func, 0x10); - aper_hi = read_pci_config(num, slot, func, 0x14); + aper_low = read_pci_config(bus, slot, func, 0x10); + aper_hi = read_pci_config(bus, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); /* @@ -213,15 +225,15 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { - int num, slot, func; + int bus, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { + for (bus = 0; bus < 256; bus++) { for (slot = 0; slot < 32; slot++) { for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num, slot, func, + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) break; @@ -230,17 +242,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num, slot, func, + cap = find_cap(bus, slot, func, PCI_CAP_ID_AGP); if (!cap) break; *valid_agp = 1; - return read_agp(num, slot, func, cap, + return read_agp(bus, slot, func, cap, order); } /* No multi-function device? */ - type = read_pci_config_byte(num, slot, func, + type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; @@ -280,38 +292,49 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - int fix, num; + int fix, slot; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; int aper_enabled = 0, last_aper_enabled = 0; + int i; if (!early_pci_allowed()) return; fix = 0; - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - ctl = read_pci_config(0, num, 3, 0x90); - aper_enabled = ctl & 1; - aper_order = (ctl >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base) || - (last_aper_enabled && aper_enabled != last_aper_enabled)) { - fix = 1; - break; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + aper_enabled = ctl & AMD64_GARTEN; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base) || + (last_aper_enabled && aper_enabled != last_aper_enabled)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; } - last_aper_order = aper_order; - last_aper_base = aper_base; - last_aper_enabled = aper_enabled; } +out: if (!fix && !aper_enabled) return; @@ -330,13 +353,22 @@ void __init early_gart_iommu_check(void) } /* different nodes have different setting, disable them all at first*/ - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; - ctl = read_pci_config(0, num, 3, 0x90); - ctl &= ~1; - write_pci_config(0, num, 3, 0x90, ctl); + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + ctl &= ~AMD64_GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + } } } @@ -348,8 +380,8 @@ void __init gart_iommu_hole_init(void) u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int fix, num, valid_agp = 0; - int node; + int fix, slot, valid_agp = 0; + int i, node; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -362,48 +394,58 @@ void __init gart_iommu_hole_init(void) fix = 0; node = 0; - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - iommu_detected = 1; - gart_iommu_aperture = 1; - - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", - node, aper_base, aper_size >> 20); - node++; - - if (!aperture_valid(aper_base, aper_size, 64<<20)) { - if (valid_agp && agp_aper_base && - agp_aper_base == aper_base && - agp_aper_order == aper_order) { - /* the same between two setting from NB and agp */ - if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { - printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); - printk(KERN_ERR "please increase GART size in your BIOS setup\n"); - printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); - printed_gart_size_msg = 1; + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + iommu_detected = 1; + gart_iommu_aperture = 1; + + aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); + node++; + + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { + printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); + printk(KERN_ERR "please increase GART size in your BIOS setup\n"); + printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + goto out; } - } else { - fix = 1; - break; } - } - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base)) { - fix = 1; - break; + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; } - last_aper_order = aper_order; - last_aper_base = aper_base; } +out: if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; @@ -452,16 +494,22 @@ void __init gart_iommu_hole_init(void) } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { - if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* - * Don't enable translation yet. That is done later. - * Assume this BIOS didn't initialise the GART so - * just overwrite all previous bits - */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + int bus; + int dev_base, dev_limit; + + bus = bus_dev_ranges[i].bus; + dev_base = bus_dev_ranges[i].dev_base; + dev_limit = bus_dev_ranges[i].dev_limit; + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + /* Don't enable translation yet. That is done later. + Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); + } } } -- cgit v1.1 From 330fce23dab6e6a3d1979e55f27aba4c0c301331 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Sat, 19 Apr 2008 01:31:45 -0700 Subject: x86: reserve dma32 early for gart fix we can use free_bootmem() directly. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/pci-dma.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a017f0..81862d0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -92,7 +92,6 @@ void __init dma32_reserve_bootmem(void) } static void __init dma32_free_bootmem(void) { - int node; if (end_pfn <= MAX_DMA32_PFN) return; @@ -100,9 +99,7 @@ static void __init dma32_free_bootmem(void) if (!dma32_bootmem_ptr) return; - for_each_online_node(node) - free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), - dma32_bootmem_size); + free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; -- cgit v1.1 From 3bb6fbf9969a8bbe4892968659239273d092e78a Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@ucw.cz> Date: Tue, 15 Apr 2008 12:43:57 +0200 Subject: x86 gart: factor out common code Cleanup gart handling on amd64 a bit: move common code into enable_gart_translation , and use symbolic register names where appropriate. Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/pci-gart_64.c | 23 ++++++----------------- drivers/char/agp/amd64-agp.c | 29 +++++++++-------------------- include/asm-x86/gart.h | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index bffcf45..1f99b62 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -533,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - pci_read_config_dword(dev, 0x94, &aper_base_32); - pci_read_config_dword(dev, 0x90, &aper_order); + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); aper_order = (aper_order >> 1) & 7; aper_base = aper_base_32 & 0x7fff; @@ -592,19 +592,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 gatt_reg; - u32 ctl; - dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; - pci_write_config_dword(dev, AMD64_GARTTABLEBASE, gatt_reg); - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - - ctl |= GARTEN; - ctl &= ~(DISGARTCPU | DISGARTIO); - - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + enable_gart_translation(dev, __pa(gatt)); } flush_gart(); @@ -648,11 +637,11 @@ void gart_iommu_shutdown(void) u32 ctl; dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl &= ~1; + ctl &= ~GARTEN; - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } } diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 9c24470..e3c7ea0 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -150,25 +150,14 @@ static u64 amd64_configure(struct pci_dev *hammer, u64 gatt_table) { u64 aperturebase; u32 tmp; - u64 addr, aper_base; + u64 aper_base; /* Address to map to */ - pci_read_config_dword (hammer, AMD64_GARTAPERTUREBASE, &tmp); + pci_read_config_dword(hammer, AMD64_GARTAPERTUREBASE, &tmp); aperturebase = tmp << 25; aper_base = (aperturebase & PCI_BASE_ADDRESS_MEM_MASK); - /* address of the mappings table */ - addr = (u64) gatt_table; - addr >>= 12; - tmp = (u32) addr<<4; - tmp &= ~0xf; - pci_write_config_dword(hammer, AMD64_GARTTABLEBASE, tmp); - - /* Enable GART translation for this hammer. */ - pci_read_config_dword(hammer, AMD64_GARTAPERTURECTL, &tmp); - tmp |= GARTEN; - tmp &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(hammer, AMD64_GARTAPERTURECTL, tmp); + enable_gart_translation(hammer, gatt_table); return aper_base; } @@ -207,9 +196,9 @@ static void amd64_cleanup(void) for (i = 0; i < num_k8_northbridges; i++) { struct pci_dev *dev = k8_northbridges[i]; /* disable gart translation */ - pci_read_config_dword (dev, AMD64_GARTAPERTURECTL, &tmp); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~AMD64_GARTEN; - pci_write_config_dword (dev, AMD64_GARTAPERTURECTL, tmp); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp); } } @@ -289,9 +278,9 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, u32 nb_order, nb_base; u16 apsize; - pci_read_config_dword(nb, 0x90, &nb_order); + pci_read_config_dword(nb, AMD64_GARTAPERTURECTL, &nb_order); nb_order = (nb_order >> 1) & 7; - pci_read_config_dword(nb, 0x94, &nb_base); + pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base); nb_aper = nb_base << 25; if (aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) { return 0; @@ -327,8 +316,8 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order)) return -1; - pci_write_config_dword(nb, 0x90, order << 1); - pci_write_config_dword(nb, 0x94, aper >> 25); + pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); + pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25); return 0; } diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 248e577..6f22786 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -52,4 +52,21 @@ static inline void gart_iommu_shutdown(void) #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) +{ + u32 tmp, ctl; + + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + #endif -- cgit v1.1 From f784946ded3e734524d1f48a6a8286b8a152c3b9 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Date: Fri, 2 May 2008 16:45:08 -0700 Subject: x86: use per_cpu data in nmi_32 use per_cpu for per CPU data. Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi_32.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bb..69bdae5 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -293,9 +293,9 @@ void stop_apic_nmi_watchdog(void *unused) * here too!] */ -static unsigned int - last_irq_sums [NR_CPUS], - alert_counter [NR_CPUS]; +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) { @@ -303,12 +303,13 @@ void touch_nmi_watchdog(void) unsigned cpu; /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. */ for_each_present_cpu(cpu) { - if (alert_counter[cpu]) - alert_counter[cpu] = 0; + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; } } @@ -358,22 +359,26 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) */ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + per_cpu(irq_stat, cpu).irq0_irqs; + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && last_irq_sums[cpu] == sum) { + if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); } else { - last_irq_sums[cpu] = sum; - alert_counter[cpu] = 0; + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); } /* see if the nmi watchdog went off */ if (!__get_cpu_var(wd_enabled)) -- cgit v1.1 From f59a9310b97879159ef4d25227bc270278f1ee71 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Date: Fri, 2 May 2008 16:44:52 -0700 Subject: x86: nmi and irq counters are unsigned in nmi_64.c __nmi_count, apic_timer_irqs and irq0_irqs are unsigned. Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded..33cb8ec 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -79,7 +79,7 @@ static __init void nmi_cpu_busy(void *data) int __init check_nmi_watchdog(void) { - int *prev_nmi_count; + unsigned int *prev_nmi_count; int cpu; if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) @@ -316,7 +316,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { - int sum; + unsigned int sum; int touched = 0; int cpu = smp_processor_id(); int rc = 0; -- cgit v1.1 From 622e3f28e50fe30cc199c735440cd7c75e0033b0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sun, 4 May 2008 19:57:19 +0400 Subject: x86: 64-bit defconfig remake The current x86_64_defconfig contains a number of nonexistent symbols. Lets fix it. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/configs/x86_64_defconfig | 627 ++++++++++++++++++++------------------ 1 file changed, 328 insertions(+), 299 deletions(-) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2d6f5b2..2f1b356 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,48 +1,65 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-git14 -# Fri Jul 20 09:53:15 2007 +# Linux kernel version: 2.6.26-rc1 +# Sun May 4 19:27:29 2008 # -CONFIG_X86_64=y CONFIG_64BIT=y +# CONFIG_X86_32 is not set +CONFIG_X86_64=y CONFIG_X86=y +CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig" +# CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y -CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CMOS_UPDATE=y -CONFIG_ZONE_DMA32=y +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y -CONFIG_SEMAPHORE_SLEEPERS=y +CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_FAST_CMPXCHG_LOCAL=y CONFIG_MMU=y CONFIG_ZONE_DMA=y -CONFIG_QUICKLIST=y -CONFIG_NR_QUICK=2 -CONFIG_RWSEM_GENERIC_SPINLOCK=y -CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_X86_CMPXCHG=y -CONFIG_EARLY_PRINTK=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_ARCH_POPULATES_NODE_MAP=y -CONFIG_DMI=y -CONFIG_AUDIT_ARCH=y CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_HWEIGHT=y +# CONFIG_GENERIC_GPIO is not set +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_RWSEM_GENERIC_SPINLOCK=y +# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set -CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ZONE_DMA32=y +CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_AOUT=y +CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_X86_SMP=y +CONFIG_X86_64_SMP=y +CONFIG_X86_HT=y +CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y +# CONFIG_KTIME_SCALAR is not set # -# Code maturity level options +# General setup # CONFIG_EXPERIMENTAL=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 - -# -# General setup -# CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y @@ -51,14 +68,21 @@ CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 -# CONFIG_CPUSETS is not set -CONFIG_SYSFS_DEPRECATED=y +# CONFIG_CGROUPS is not set +# CONFIG_GROUP_SCHED is not set +# CONFIG_USER_SCHED is not set +# CONFIG_CGROUP_SCHED is not set +# CONFIG_SYSFS_DEPRECATED_V2 is not set CONFIG_RELAY=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +# CONFIG_USER_NS is not set +CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -66,6 +90,7 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y +CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -73,6 +98,7 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_ANON_INODES=y @@ -85,6 +111,17 @@ CONFIG_VM_EVENT_COUNTERS=y CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set +CONFIG_PROFILING=y +# CONFIG_MARKERS is not set +CONFIG_OPROFILE=y +CONFIG_HAVE_OPROFILE=y +CONFIG_KPROBES=y +CONFIG_KRETPROBES=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 @@ -98,6 +135,7 @@ CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set +CONFIG_BLOCK_COMPAT=y # # IO Schedulers @@ -111,107 +149,169 @@ CONFIG_IOSCHED_CFQ=y CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="cfq" +CONFIG_CLASSIC_RCU=y # # Processor type and features # +CONFIG_TICK_ONESHOT=y +# CONFIG_NO_HZ is not set +CONFIG_HIGH_RES_TIMERS=y +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y +CONFIG_SMP=y CONFIG_X86_PC=y +# CONFIG_X86_ELAN is not set +# CONFIG_X86_VOYAGER is not set +# CONFIG_X86_NUMAQ is not set +# CONFIG_X86_SUMMIT is not set +# CONFIG_X86_BIGSMP is not set +# CONFIG_X86_VISWS is not set +# CONFIG_X86_GENERICARCH is not set +# CONFIG_X86_ES7000 is not set +# CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set +# CONFIG_PARAVIRT_GUEST is not set +# CONFIG_MEMTEST_BOOTPARAM is not set +# CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +# CONFIG_MPENTIUMII is not set +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUMM is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set # CONFIG_MK8 is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MEFFICEON is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MGEODEGX1 is not set +# CONFIG_MGEODE_LX is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set # CONFIG_MCORE2 is not set CONFIG_GENERIC_CPU=y +CONFIG_X86_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_INTERNODE_CACHE_BYTES=128 -CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG=y +CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_GOOD_APIC=y -# CONFIG_MICROCODE is not set -CONFIG_X86_MSR=y -CONFIG_X86_CPUID=y -CONFIG_X86_HT=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_MTRR=y -CONFIG_SMP=y +CONFIG_X86_TSC=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set +CONFIG_SWIOTLB=y +CONFIG_IOMMU_HELPER=y +CONFIG_NR_CPUS=32 CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set -CONFIG_PREEMPT_BKL=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_MCE=y +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +# CONFIG_I8K is not set +# CONFIG_MICROCODE is not set +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y CONFIG_NUMA=y CONFIG_K8_NUMA=y -CONFIG_NODES_SHIFT=6 CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y CONFIG_NUMA_EMU=y +CONFIG_NODES_SHIFT=6 +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_SELECT_MEMORY_MODEL=y +# CONFIG_FLATMEM_MANUAL is not set +# CONFIG_DISCONTIGMEM_MANUAL is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +# CONFIG_SPARSEMEM_VMEMMAP is not set + +# +# Memory hotplug is currently incompatible with Software Suspend +# +CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y -CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y -CONFIG_NR_CPUS=32 -CONFIG_PHYSICAL_ALIGN=0x200000 -CONFIG_HOTPLUG_CPU=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_GART_IOMMU=y -# CONFIG_CALGARY_IOMMU is not set -CONFIG_SWIOTLB=y -CONFIG_X86_MCE=y -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -# CONFIG_KEXEC is not set -# CONFIG_CRASH_DUMP is not set -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_START=0x200000 +CONFIG_MTRR=y +# CONFIG_X86_PAT is not set +# CONFIG_EFI is not set CONFIG_SECCOMP=y -# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 -CONFIG_K8_NB=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_ISA_DMA_API=y -CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_SCHED_HRTICK=y +# CONFIG_KEXEC is not set +# CONFIG_CRASH_DUMP is not set +CONFIG_PHYSICAL_START=0x200000 +# CONFIG_RELOCATABLE is not set +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_HOTPLUG_CPU=y +# CONFIG_COMPAT_VDSO is not set +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y # # Power management options # +CONFIG_ARCH_HIBERNATION_HEADER=y CONFIG_PM=y -# CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_SLEEP=y +# CONFIG_SUSPEND is not set CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" - -# -# ACPI (Advanced Configuration and Power Interface) Support -# CONFIG_ACPI=y CONFIG_ACPI_SLEEP=y -CONFIG_ACPI_SLEEP_PROC_FS=y -CONFIG_ACPI_SLEEP_PROC_SLEEP=y CONFIG_ACPI_PROCFS=y +CONFIG_ACPI_PROCFS_POWER=y +CONFIG_ACPI_SYSFS_POWER=y +CONFIG_ACPI_PROC_EVENT=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y CONFIG_ACPI_FAN=y -# CONFIG_ACPI_DOCK is not set +CONFIG_ACPI_DOCK=y +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y +# CONFIG_ACPI_WMI is not set # CONFIG_ACPI_ASUS is not set # CONFIG_ACPI_TOSHIBA is not set +# CONFIG_ACPI_CUSTOM_DSDT is not set CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y @@ -230,7 +330,10 @@ CONFIG_CPU_FREQ_DEBUG=y CONFIG_CPU_FREQ_STAT=y # CONFIG_CPU_FREQ_STAT_DETAILS is not set CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y @@ -240,16 +343,19 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers # +CONFIG_X86_ACPI_CPUFREQ=y CONFIG_X86_POWERNOW_K8=y CONFIG_X86_POWERNOW_K8_ACPI=y # CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_ACPI_CPUFREQ=y +CONFIG_X86_P4_CLOCKMOD=y # # shared options # CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y -# CONFIG_X86_SPEEDSTEP_LIB is not set +CONFIG_X86_SPEEDSTEP_LIB=y +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y # # Bus options (PCI etc.) @@ -257,16 +363,18 @@ CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y CONFIG_PCI=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_DOMAINS=y +# CONFIG_DMAR is not set CONFIG_PCIEPORTBUS=y CONFIG_PCIEAER=y +# CONFIG_PCIEASPM is not set CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y +CONFIG_PCI_LEGACY=y # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set - -# -# PCCARD (PCMCIA/CardBus) support -# +CONFIG_ISA_DMA_API=y +CONFIG_K8_NB=y # CONFIG_PCCARD is not set # CONFIG_HOTPLUG_PCI is not set @@ -274,10 +382,12 @@ CONFIG_PCI_MSI=y # Executable file formats / Emulations # CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set CONFIG_IA32_EMULATION=y CONFIG_IA32_AOUT=y CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_SYSVIPC_COMPAT=y # @@ -313,6 +423,7 @@ CONFIG_INET_TUNNEL=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -334,8 +445,10 @@ CONFIG_IPV6=y # CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y +CONFIG_IPV6_NDISC_NODETYPE=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set +# CONFIG_IPV6_MROUTE is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set # CONFIG_IP_DCCP is not set @@ -352,10 +465,6 @@ CONFIG_IPV6_SIT=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# # CONFIG_NET_SCHED is not set # @@ -364,6 +473,7 @@ CONFIG_IPV6_SIT=y # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set +# CONFIG_CAN is not set # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set @@ -385,6 +495,7 @@ CONFIG_IPV6_SIT=y # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y @@ -416,7 +527,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 -CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 +# CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set CONFIG_MISC_DEVICES=y @@ -427,17 +538,20 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set +# CONFIG_INTEL_MENLOW is not set +# CONFIG_ENCLOSURE_SERVICES is not set +CONFIG_HAVE_IDE=y CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y # -# Please see Documentation/ide.txt for help/info on IDE drives +# Please see Documentation/ide/ide.txt for help/info on IDE drives # # CONFIG_BLK_DEV_IDE_SATA is not set -# CONFIG_BLK_DEV_HD_IDE is not set CONFIG_BLK_DEV_IDEDISK=y CONFIG_IDEDISK_MULTI_MODE=y CONFIG_BLK_DEV_IDECD=y +CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set @@ -449,18 +563,21 @@ CONFIG_IDE_PROC_FS=y # IDE chipset support/bugfixes # CONFIG_IDE_GENERIC=y +# CONFIG_BLK_DEV_PLATFORM is not set # CONFIG_BLK_DEV_CMD640 is not set # CONFIG_BLK_DEV_IDEPNP is not set +CONFIG_BLK_DEV_IDEDMA_SFF=y + +# +# PCI IDE chipsets support +# CONFIG_BLK_DEV_IDEPCI=y -# CONFIG_IDEPCI_SHARE_IRQ is not set CONFIG_IDEPCI_PCIBUS_ORDER=y # CONFIG_BLK_DEV_OFFBOARD is not set # CONFIG_BLK_DEV_GENERIC is not set # CONFIG_BLK_DEV_OPTI621 is not set # CONFIG_BLK_DEV_RZ1000 is not set CONFIG_BLK_DEV_IDEDMA_PCI=y -# CONFIG_BLK_DEV_IDEDMA_FORCED is not set -# CONFIG_IDEDMA_ONLYDISK is not set # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_BLK_DEV_ALI15X3 is not set CONFIG_BLK_DEV_AMD74XX=y @@ -487,9 +604,8 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set # CONFIG_BLK_DEV_TC86C001 is not set -# CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_IDEDMA_IVB is not set +# CONFIG_BLK_DEV_HD_ONLY is not set # CONFIG_BLK_DEV_HD is not set # @@ -528,105 +644,16 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set -CONFIG_SCSI_SAS_ATTRS=y # CONFIG_SCSI_SAS_LIBSAS is not set - -# -# SCSI low-level drivers -# -# CONFIG_ISCSI_TCP is not set -# CONFIG_BLK_DEV_3W_XXXX_RAID is not set -# CONFIG_SCSI_3W_9XXX is not set -# CONFIG_SCSI_ACARD is not set -# CONFIG_SCSI_AACRAID is not set -# CONFIG_SCSI_AIC7XXX is not set -# CONFIG_SCSI_AIC7XXX_OLD is not set -CONFIG_SCSI_AIC79XX=y -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=4000 -# CONFIG_AIC79XX_DEBUG_ENABLE is not set -CONFIG_AIC79XX_DEBUG_MASK=0 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -# CONFIG_SCSI_AIC94XX is not set -# CONFIG_SCSI_ARCMSR is not set -# CONFIG_MEGARAID_NEWGEN is not set -# CONFIG_MEGARAID_LEGACY is not set -# CONFIG_MEGARAID_SAS is not set -# CONFIG_SCSI_HPTIOP is not set -# CONFIG_SCSI_BUSLOGIC is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_FUTURE_DOMAIN is not set -# CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_STEX is not set -# CONFIG_SCSI_SYM53C8XX_2 is not set -# CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_1280 is not set -# CONFIG_SCSI_QLA_FC is not set -# CONFIG_SCSI_QLA_ISCSI is not set -# CONFIG_SCSI_LPFC is not set -# CONFIG_SCSI_DC395x is not set -# CONFIG_SCSI_DC390T is not set -# CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_SRP is not set +# CONFIG_SCSI_SRP_ATTRS is not set +# CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y +# CONFIG_SATA_PMP is not set CONFIG_SATA_AHCI=y -CONFIG_SATA_SVW=y -CONFIG_ATA_PIIX=y -# CONFIG_SATA_MV is not set -CONFIG_SATA_NV=y -# CONFIG_PDC_ADMA is not set -# CONFIG_SATA_QSTOR is not set -# CONFIG_SATA_PROMISE is not set -# CONFIG_SATA_SX4 is not set -CONFIG_SATA_SIL=y # CONFIG_SATA_SIL24 is not set -# CONFIG_SATA_SIS is not set -# CONFIG_SATA_ULI is not set -CONFIG_SATA_VIA=y -# CONFIG_SATA_VITESSE is not set -# CONFIG_SATA_INIC162X is not set -# CONFIG_PATA_ALI is not set -# CONFIG_PATA_AMD is not set -# CONFIG_PATA_ARTOP is not set -# CONFIG_PATA_ATIIXP is not set -# CONFIG_PATA_CMD640_PCI is not set -# CONFIG_PATA_CMD64X is not set -# CONFIG_PATA_CS5520 is not set -# CONFIG_PATA_CS5530 is not set -# CONFIG_PATA_CYPRESS is not set -# CONFIG_PATA_EFAR is not set -# CONFIG_ATA_GENERIC is not set -# CONFIG_PATA_HPT366 is not set -# CONFIG_PATA_HPT37X is not set -# CONFIG_PATA_HPT3X2N is not set -# CONFIG_PATA_HPT3X3 is not set -# CONFIG_PATA_IT821X is not set -# CONFIG_PATA_IT8213 is not set -# CONFIG_PATA_JMICRON is not set -# CONFIG_PATA_TRIFLEX is not set -# CONFIG_PATA_MARVELL is not set -# CONFIG_PATA_MPIIX is not set -# CONFIG_PATA_OLDPIIX is not set -# CONFIG_PATA_NETCELL is not set -# CONFIG_PATA_NS87410 is not set -# CONFIG_PATA_OPTI is not set -# CONFIG_PATA_OPTIDMA is not set -# CONFIG_PATA_PDC_OLD is not set -# CONFIG_PATA_RADISYS is not set -# CONFIG_PATA_RZ1000 is not set -# CONFIG_PATA_SC1200 is not set -# CONFIG_PATA_SERVERWORKS is not set -# CONFIG_PATA_PDC2027X is not set -# CONFIG_PATA_SIL680 is not set -# CONFIG_PATA_SIS is not set -# CONFIG_PATA_VIA is not set -# CONFIG_PATA_WINBOND is not set +# CONFIG_ATA_SFF is not set CONFIG_MD=y # CONFIG_BLK_DEV_MD is not set CONFIG_BLK_DEV_DM=y @@ -637,16 +664,14 @@ CONFIG_BLK_DEV_DM=y # CONFIG_DM_ZERO is not set # CONFIG_DM_MULTIPATH is not set # CONFIG_DM_DELAY is not set - -# -# Fusion MPT device support -# +# CONFIG_DM_UEVENT is not set CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set # CONFIG_FUSION_SAS is not set CONFIG_FUSION_MAX_SGE=128 # CONFIG_FUSION_CTL is not set +# CONFIG_FUSION_LOGGING is not set # # IEEE 1394 (FireWire) support @@ -687,6 +712,7 @@ CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set CONFIG_TUN=y +# CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set @@ -709,15 +735,21 @@ CONFIG_TULIP=y # CONFIG_DM9102 is not set # CONFIG_ULI526X is not set # CONFIG_HP100 is not set +# CONFIG_IBM_NEW_EMAC_ZMII is not set +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set CONFIG_AMD8111_ETH=y # CONFIG_AMD8111E_NAPI is not set # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y CONFIG_FORCEDETH=y # CONFIG_FORCEDETH_NAPI is not set -# CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y # CONFIG_FEALNX is not set @@ -729,6 +761,7 @@ CONFIG_8139TOO=y # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set # CONFIG_8139_OLD_RX_RESET is not set +# CONFIG_R6040 is not set # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set @@ -740,6 +773,10 @@ CONFIG_NETDEV_1000=y CONFIG_E1000=y # CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set +# CONFIG_E1000E is not set +# CONFIG_E1000E_ENABLED is not set +# CONFIG_IP1000 is not set +# CONFIG_IGB is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -755,12 +792,17 @@ CONFIG_BNX2=y CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set +# CONFIG_IXGBE is not set # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set +# CONFIG_NIU is not set # CONFIG_MLX4_CORE is not set +# CONFIG_TEHUTI is not set +# CONFIG_BNX2X is not set +# CONFIG_SFC is not set # CONFIG_TR is not set # @@ -768,6 +810,8 @@ CONFIG_S2IO=m # # CONFIG_WLAN_PRE80211 is not set # CONFIG_WLAN_80211 is not set +# CONFIG_IWLWIFI is not set +# CONFIG_IWLWIFI_LEDS is not set # # USB Network Adapters @@ -776,7 +820,6 @@ CONFIG_S2IO=m # CONFIG_USB_KAWETH is not set # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set -# CONFIG_USB_USBNET_MII is not set # CONFIG_USB_USBNET is not set # CONFIG_WAN is not set # CONFIG_FDDI is not set @@ -784,8 +827,8 @@ CONFIG_S2IO=m # CONFIG_PPP is not set # CONFIG_SLIP is not set # CONFIG_NET_FC is not set -# CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y +# CONFIG_NETCONSOLE_DYNAMIC is not set CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -807,7 +850,6 @@ CONFIG_INPUT_MOUSEDEV_PSAUX=y CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_JOYDEV is not set -# CONFIG_INPUT_TSDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_EVBUG is not set @@ -856,7 +898,9 @@ CONFIG_VT=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y # CONFIG_VT_HW_CONSOLE_BINDING is not set +# CONFIG_DEVKMEM is not set # CONFIG_SERIAL_NONSTANDARD is not set +# CONFIG_NOZOMI is not set # # Serial drivers @@ -880,7 +924,6 @@ CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_IPMI_HANDLER is not set -# CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y @@ -888,12 +931,6 @@ CONFIG_HW_RANDOM_AMD=y CONFIG_RTC=y # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set -CONFIG_AGP=y -CONFIG_AGP_AMD64=y -CONFIG_AGP_INTEL=y -# CONFIG_AGP_SIS is not set -# CONFIG_AGP_VIA is not set -# CONFIG_DRM is not set # CONFIG_MWAVE is not set # CONFIG_PC8736x_GPIO is not set CONFIG_RAW_DRIVER=y @@ -906,40 +943,69 @@ CONFIG_HPET_MMAP=y # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y # CONFIG_I2C is not set - -# -# SPI support -# # CONFIG_SPI is not set -# CONFIG_SPI_MASTER is not set # CONFIG_W1 is not set -# CONFIG_POWER_SUPPLY is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +# CONFIG_PDA_POWER is not set +# CONFIG_BATTERY_DS2760 is not set # CONFIG_HWMON is not set +CONFIG_THERMAL=y +# CONFIG_WATCHDOG is not set + +# +# Sonics Silicon Backplane +# +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=y +CONFIG_SSB_SPROM=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +# CONFIG_SSB_B43_PCI_BRIDGE is not set +# CONFIG_SSB_DEBUG is not set +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y # # Multifunction device drivers # # CONFIG_MFD_SM501 is not set +# CONFIG_HTC_PASIC3 is not set # # Multimedia devices # + +# +# Multimedia core support +# # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set + +# +# Multimedia drivers +# CONFIG_DAB=y # CONFIG_USB_DABUSB is not set # # Graphics support # +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +# CONFIG_AGP_SIS is not set +# CONFIG_AGP_VIA is not set +# CONFIG_DRM is not set +# CONFIG_VGASTATE is not set +# CONFIG_VIDEO_OUTPUT_CONTROL is not set +# CONFIG_FB is not set # CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Display device support # # CONFIG_DISPLAY_SUPPORT is not set -# CONFIG_VGASTATE is not set -# CONFIG_FB is not set # # Console display driver support @@ -971,6 +1037,7 @@ CONFIG_SOUND_PRIME=y CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set +CONFIG_HIDRAW=y # # USB Input Devices @@ -985,6 +1052,7 @@ CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y CONFIG_USB=y # CONFIG_USB_DEBUG is not set +# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set # # Miscellaneous USB options @@ -993,18 +1061,19 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # # USB Host Controller Drivers # +# CONFIG_USB_C67X00_HCD is not set CONFIG_USB_EHCI_HCD=y -# CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_ISP116X_HCD is not set +# CONFIG_USB_ISP1760_HCD is not set CONFIG_USB_OHCI_HCD=y +# CONFIG_USB_OHCI_HCD_SSB is not set # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set # CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y @@ -1036,7 +1105,9 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_SDDR55 is not set # CONFIG_USB_STORAGE_JUMPSHOT is not set # CONFIG_USB_STORAGE_ALAUDA is not set +# CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set # CONFIG_USB_LIBUSUAL is not set # @@ -1049,10 +1120,6 @@ CONFIG_USB_MON=y # # USB port drivers # - -# -# USB Serial Converter support -# # CONFIG_USB_SERIAL is not set # @@ -1078,55 +1145,20 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set - -# -# USB DSL modem support -# - -# -# USB Gadget Support -# # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set - -# -# LED devices -# +# CONFIG_MEMSTICK is not set # CONFIG_NEW_LEDS is not set - -# -# LED drivers -# - -# -# LED Triggers -# +# CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set # CONFIG_EDAC is not set - -# -# Real Time Clock -# # CONFIG_RTC_CLASS is not set - -# -# DMA Engine support -# -# CONFIG_DMA_ENGINE is not set - -# -# DMA Clients -# +CONFIG_DMADEVICES=y # # DMA Devices # -CONFIG_VIRTUALIZATION=y -# CONFIG_KVM is not set - -# -# Userspace I/O -# +# CONFIG_INTEL_IOATDMA is not set # CONFIG_UIO is not set # @@ -1136,6 +1168,7 @@ CONFIG_VIRTUALIZATION=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y +# CONFIG_ISCSI_IBFT_FIND is not set # # File systems @@ -1164,12 +1197,10 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_ROMFS_FS is not set +CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set -CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set @@ -1204,7 +1235,6 @@ CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set # @@ -1219,48 +1249,19 @@ CONFIG_RAMFS=y # CONFIG_EFS_FS is not set # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set +# CONFIG_MINIX_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set +# CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -# CONFIG_NFS_V3_ACL is not set -# CONFIG_NFS_V4 is not set -# CONFIG_NFS_DIRECTIO is not set -CONFIG_NFSD=y -CONFIG_NFSD_V3=y -# CONFIG_NFSD_V3_ACL is not set -# CONFIG_NFSD_V4 is not set -CONFIG_NFSD_TCP=y -CONFIG_ROOT_NFS=y -CONFIG_LOCKD=y -CONFIG_LOCKD_V4=y -CONFIG_EXPORTFS=y -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=y -# CONFIG_SUNRPC_BIND34 is not set -# CONFIG_RPCSEC_GSS_KRB5 is not set -# CONFIG_RPCSEC_GSS_SPKM3 is not set -# CONFIG_SMB_FS is not set -# CONFIG_CIFS is not set -# CONFIG_NCP_FS is not set -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set +# CONFIG_NETWORK_FILESYSTEMS is not set # # Partition Types # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# CONFIG_NLS=y CONFIG_NLS_DEFAULT="iso8859-1" CONFIG_NLS_CODEPAGE_437=y @@ -1301,25 +1302,16 @@ CONFIG_NLS_ISO8859_15=y # CONFIG_NLS_KOI8_R is not set # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y - -# -# Distributed Lock Manager -# # CONFIG_DLM is not set # -# Instrumentation Support -# -CONFIG_PROFILING=y -CONFIG_OPROFILE=y -CONFIG_KPROBES=y - -# # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set +CONFIG_ENABLE_WARN_DEPRECATED=y # CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_FS=y @@ -1330,6 +1322,7 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set CONFIG_TIMER_STATS=y +# CONFIG_DEBUG_OBJECTS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set @@ -1344,28 +1337,64 @@ CONFIG_TIMER_STATS=y CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_WRITECOUNT is not set # CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_SG is not set # CONFIG_FRAME_POINTER is not set -CONFIG_OPTIMIZE_INLINING=y +# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set -# CONFIG_DEBUG_RODATA is not set -# CONFIG_IOMMU_DEBUG is not set +# CONFIG_LATENCYTOP is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +# CONFIG_KGDB is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_NONPROMISC_DEVMEM is not set +CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_PER_CPU_MAPS is not set +# CONFIG_X86_PTDUMP is not set +# CONFIG_DEBUG_RODATA is not set +# CONFIG_DIRECT_GBPAGES is not set +# CONFIG_DEBUG_NX_TEST is not set +CONFIG_X86_MPPARSE=y +# CONFIG_IOMMU_DEBUG is not set +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +# CONFIG_DEBUG_BOOT_PARAMS is not set +# CONFIG_CPA_DEBUG is not set # # Security options # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_HAVE_KVM is not set +# CONFIG_VIRTUALIZATION is not set +# CONFIG_KVM is not set +# CONFIG_VIRTIO_PCI is not set +# CONFIG_VIRTIO_BALLOON is not set # # Library routines # CONFIG_BITREVERSE=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set -- cgit v1.1 From b5d958ea66ac11b1190c24f042f517adf9229a98 Mon Sep 17 00:00:00 2001 From: Auke Kok <auke-jan.h.kok@intel.com> Date: Wed, 9 Apr 2008 13:17:39 -0700 Subject: e1000e: set CONFIG_E1000E=y in x86 defconfigs This adds to the already default CONFIG_E1000=y in these files. Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/configs/i386_defconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index ad7ddaa..73f2c3c 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -814,6 +814,10 @@ CONFIG_NETDEV_1000=y CONFIG_E1000=y # CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set +CONFIG_E1000E=y +CONFIG_E1000E_ENABLED=y +# CONFIG_IP1000 is not set +# CONFIG_IGB is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set -- cgit v1.1 From 5cb04df8d3f03e37a19f2502591a84156be71772 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sun, 4 May 2008 19:49:04 +0200 Subject: x86: defconfig updates refresh 32-bit defconfig too, and update the 64-bit configs as well, the defconfig should be much more useful by default, so most of the updates are the enabling of various options. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/configs/i386_defconfig | 1711 ++++++++++++++++++++++++++----------- arch/x86/configs/x86_64_defconfig | 1378 ++++++++++++++++++++++------- 2 files changed, 2291 insertions(+), 798 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 73f2c3c..9bc34e2 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,54 +1,103 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-git14 -# Fri Jul 20 09:53:15 2007 +# Linux kernel version: 2.6.26-rc1 +# Sun May 4 19:59:02 2008 # +# CONFIG_64BIT is not set CONFIG_X86_32=y +# CONFIG_X86_64 is not set +CONFIG_X86=y +CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig" +# CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_CMOS_UPDATE=y CONFIG_CLOCKSOURCE_WATCHDOG=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y -CONFIG_SEMAPHORE_SLEEPERS=y -CONFIG_X86=y +CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_FAST_CMPXCHG_LOCAL=y CONFIG_MMU=y CONFIG_ZONE_DMA=y -CONFIG_QUICKLIST=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_GENERIC_BUG=y CONFIG_GENERIC_HWEIGHT=y +# CONFIG_GENERIC_GPIO is not set CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_DMI=y -CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +# CONFIG_RWSEM_GENERIC_SPINLOCK is not set +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set +CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +# CONFIG_GENERIC_TIME_VSYSCALL is not set +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +# CONFIG_ZONE_DMA32 is not set +CONFIG_ARCH_POPULATES_NODE_MAP=y +# CONFIG_AUDIT_ARCH is not set +CONFIG_ARCH_SUPPORTS_AOUT=y +CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_X86_SMP=y +CONFIG_X86_32_SMP=y +CONFIG_X86_HT=y +CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y +CONFIG_KTIME_SCALAR=y # -# Code maturity level options +# General setup # CONFIG_EXPERIMENTAL=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 - -# -# General setup -# CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y +# CONFIG_LOCALVERSION_AUTO is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y -# CONFIG_BSD_PROCESS_ACCT is not set -# CONFIG_TASKSTATS is not set -# CONFIG_USER_NS is not set -# CONFIG_AUDIT is not set -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=18 -# CONFIG_CPUSETS is not set -CONFIG_SYSFS_DEPRECATED=y +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_AUDIT_TREE=y +# CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_CGROUPS=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_CGROUP_NS=y +# CONFIG_CGROUP_DEVICE is not set +CONFIG_CPUSETS=y +CONFIG_GROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +# CONFIG_RT_GROUP_SCHED is not set +# CONFIG_USER_SCHED is not set +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RESOURCE_COUNTERS=y +# CONFIG_CGROUP_MEM_RES_CTLR is not set +# CONFIG_SYSFS_DEPRECATED_V2 is not set +CONFIG_PROC_PID_CPUSET=y CONFIG_RELAY=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -56,13 +105,15 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y +CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y -# CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_ANON_INODES=y @@ -76,6 +127,17 @@ CONFIG_SLUB_DEBUG=y # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set +CONFIG_PROFILING=y +CONFIG_MARKERS=y +# CONFIG_OPROFILE is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_KPROBES=y +CONFIG_KRETPROBES=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 @@ -87,10 +149,10 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -CONFIG_LBD=y -# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LBD is not set +CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_LSF is not set -# CONFIG_BLK_DEV_BSG is not set +CONFIG_BLK_DEV_BSG=y # # IO Schedulers @@ -103,7 +165,8 @@ CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_DEADLINE is not set CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="anticipatory" +CONFIG_DEFAULT_IOSCHED="cfq" +CONFIG_CLASSIC_RCU=y # # Processor type and features @@ -111,18 +174,21 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y -# CONFIG_X86_PC is not set +CONFIG_X86_PC=y # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set # CONFIG_X86_NUMAQ is not set # CONFIG_X86_SUMMIT is not set # CONFIG_X86_BIGSMP is not set # CONFIG_X86_VISWS is not set -CONFIG_X86_GENERICARCH=y +# CONFIG_X86_GENERICARCH is not set # CONFIG_X86_ES7000 is not set -# CONFIG_PARAVIRT is not set -CONFIG_X86_CYCLONE_TIMER=y +# CONFIG_X86_RDC321X is not set +# CONFIG_X86_VSMP is not set +CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y +# CONFIG_PARAVIRT_GUEST is not set # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -130,9 +196,8 @@ CONFIG_X86_CYCLONE_TIMER=y # CONFIG_M586MMX is not set # CONFIG_M686 is not set # CONFIG_MPENTIUMII is not set -CONFIG_MPENTIUMIII=y +# CONFIG_MPENTIUMIII is not set # CONFIG_MPENTIUMM is not set -# CONFIG_MCORE2 is not set # CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set # CONFIG_MK7 is not set @@ -147,14 +212,14 @@ CONFIG_MPENTIUMIII=y # CONFIG_MCYRIXIII is not set # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set -CONFIG_X86_GENERIC=y +# CONFIG_MPSC is not set +CONFIG_MCORE2=y +# CONFIG_GENERIC_CPU is not set +# CONFIG_X86_GENERIC is not set +CONFIG_X86_CPU=y CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_XADD=y -CONFIG_RWSEM_XCHGADD_ALGORITHM=y -# CONFIG_ARCH_HAS_ILOG2_U32 is not set -# CONFIG_ARCH_HAS_ILOG2_U64 is not set -CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y @@ -162,106 +227,120 @@ CONFIG_X86_POPAD_OK=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=4 +CONFIG_X86_MINIMUM_CPU_FAMILY=6 +CONFIG_X86_DEBUGCTLMSR=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y -CONFIG_NR_CPUS=32 -CONFIG_SCHED_SMT=y +CONFIG_DMI=y +# CONFIG_IOMMU_HELPER is not set +CONFIG_NR_CPUS=4 +# CONFIG_SCHED_SMT is not set CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set -CONFIG_PREEMPT_BKL=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y -CONFIG_X86_MCE=y -CONFIG_X86_MCE_NONFATAL=y -CONFIG_X86_MCE_P4THERMAL=y +# CONFIG_X86_MCE is not set CONFIG_VM86=y # CONFIG_TOSHIBA is not set # CONFIG_I8K is not set # CONFIG_X86_REBOOTFIXUPS is not set -CONFIG_MICROCODE=y -CONFIG_MICROCODE_OLD_INTERFACE=y +# CONFIG_MICROCODE is not set CONFIG_X86_MSR=y CONFIG_X86_CPUID=y - -# -# Firmware Drivers -# -# CONFIG_EDD is not set -# CONFIG_DELL_RBU is not set -# CONFIG_DCDBAS is not set -CONFIG_DMIID=y # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set CONFIG_PAGE_OFFSET=0xC0000000 CONFIG_HIGHMEM=y -CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_NEED_NODE_MEMMAP_SIZE=y +CONFIG_ARCH_FLATMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_FLATMEM_MANUAL=y +# CONFIG_FLATMEM_MANUAL is not set # CONFIG_DISCONTIGMEM_MANUAL is not set -# CONFIG_SPARSEMEM_MANUAL is not set -CONFIG_FLATMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y -# CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_STATIC=y +# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set + +# +# Memory hotplug is currently incompatible with Software Suspend +# +CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y -CONFIG_NR_QUICK=1 CONFIG_VIRT_TO_BUS=y # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y -# CONFIG_EFI is not set +# CONFIG_X86_PAT is not set +CONFIG_EFI=y # CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 -# CONFIG_KEXEC is not set -# CONFIG_CRASH_DUMP is not set -CONFIG_PHYSICAL_START=0x100000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x100000 -# CONFIG_HOTPLUG_CPU is not set -CONFIG_COMPAT_VDSO=y +CONFIG_HZ_1000=y +CONFIG_HZ=1000 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_HOTPLUG_CPU=y +# CONFIG_COMPAT_VDSO is not set CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y # -# Power management options (ACPI, APM) +# Power management options # CONFIG_PM=y -CONFIG_PM_LEGACY=y -# CONFIG_PM_DEBUG is not set - -# -# ACPI (Advanced Configuration and Power Interface) Support -# +CONFIG_PM_DEBUG=y +# CONFIG_PM_VERBOSE is not set +CONFIG_CAN_PM_TRACE=y +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_SLEEP=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" CONFIG_ACPI=y +CONFIG_ACPI_SLEEP=y CONFIG_ACPI_PROCFS=y +CONFIG_ACPI_PROCFS_POWER=y +CONFIG_ACPI_SYSFS_POWER=y +CONFIG_ACPI_PROC_EVENT=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y CONFIG_ACPI_FAN=y -# CONFIG_ACPI_DOCK is not set +CONFIG_ACPI_DOCK=y +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y +# CONFIG_ACPI_WMI is not set # CONFIG_ACPI_ASUS is not set # CONFIG_ACPI_TOSHIBA is not set -CONFIG_ACPI_BLACKLIST_YEAR=2001 -CONFIG_ACPI_DEBUG=y +# CONFIG_ACPI_CUSTOM_DSDT is not set +CONFIG_ACPI_BLACKLIST_YEAR=0 +# CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y -# CONFIG_ACPI_CONTAINER is not set +CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set # CONFIG_APM is not set @@ -271,15 +350,17 @@ CONFIG_X86_PM_TIMER=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=y CONFIG_CPU_FREQ_DEBUG=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_STAT_DETAILS is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_STAT is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set # # CPUFreq processor drivers @@ -287,8 +368,7 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_X86_ACPI_CPUFREQ=y # CONFIG_X86_POWERNOW_K6 is not set # CONFIG_X86_POWERNOW_K7 is not set -CONFIG_X86_POWERNOW_K8=y -CONFIG_X86_POWERNOW_K8_ACPI=y +# CONFIG_X86_POWERNOW_K8 is not set # CONFIG_X86_GX_SUSPMOD is not set # CONFIG_X86_SPEEDSTEP_CENTRINO is not set # CONFIG_X86_SPEEDSTEP_ICH is not set @@ -302,43 +382,72 @@ CONFIG_X86_POWERNOW_K8_ACPI=y # # shared options # -CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y +# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set # CONFIG_X86_SPEEDSTEP_LIB is not set +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y # -# Bus options (PCI, PCMCIA, EISA, MCA, ISA) +# Bus options (PCI etc.) # CONFIG_PCI=y # CONFIG_PCI_GOBIOS is not set # CONFIG_PCI_GOMMCONFIG is not set # CONFIG_PCI_GODIRECT is not set CONFIG_PCI_GOANY=y +# CONFIG_PCI_GOOLPC is not set CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y -# CONFIG_PCIEPORTBUS is not set +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +# CONFIG_HOTPLUG_PCI_PCIE is not set +CONFIG_PCIEAER=y +# CONFIG_PCIEASPM is not set CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y +# CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set -# CONFIG_HT_IRQ is not set +CONFIG_HT_IRQ=y CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set # CONFIG_SCx200 is not set +# CONFIG_OLPC is not set CONFIG_K8_NB=y - -# -# PCCARD (PCMCIA/CardBus) support -# -# CONFIG_PCCARD is not set -# CONFIG_HOTPLUG_PCI is not set - -# -# Executable file formats +CONFIG_PCCARD=y +# CONFIG_PCMCIA_DEBUG is not set +CONFIG_PCMCIA=y +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_PCMCIA_IOCTL=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=y +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +# CONFIG_PD6729 is not set +# CONFIG_I82092 is not set +CONFIG_PCCARD_NONSTATIC=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_HOTPLUG_PCI_FAKE is not set +# CONFIG_HOTPLUG_PCI_IBM is not set +# CONFIG_HOTPLUG_PCI_ACPI is not set +# CONFIG_HOTPLUG_PCI_CPCI is not set +# CONFIG_HOTPLUG_PCI_SHPC is not set + +# +# Executable file formats / Emulations # CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_AOUT is not set -# CONFIG_BINFMT_MISC is not set +CONFIG_BINFMT_MISC=y # # Networking @@ -349,59 +458,142 @@ CONFIG_NET=y # Networking options # CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set +CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y -# CONFIG_XFRM_USER is not set +CONFIG_XFRM_USER=y # CONFIG_XFRM_SUB_POLICY is not set # CONFIG_XFRM_MIGRATE is not set +# CONFIG_XFRM_STATISTICS is not set # CONFIG_NET_KEY is not set CONFIG_INET=y CONFIG_IP_MULTICAST=y -# CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_ASK_IP_FIB_HASH=y +# CONFIG_IP_FIB_TRIE is not set CONFIG_IP_FIB_HASH=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +# CONFIG_IP_PNP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set -# CONFIG_IP_MROUTE is not set +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y # CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set +CONFIG_SYN_COOKIES=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set CONFIG_INET_TUNNEL=y -CONFIG_INET_XFRM_MODE_TRANSPORT=y -CONFIG_INET_XFRM_MODE_TUNNEL=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -CONFIG_INET_DIAG=y -CONFIG_INET_TCP_DIAG=y -# CONFIG_TCP_CONG_ADVANCED is not set +CONFIG_INET_LRO=y +# CONFIG_INET_DIAG is not set +CONFIG_TCP_CONG_ADVANCED=y +# CONFIG_TCP_CONG_BIC is not set CONFIG_TCP_CONG_CUBIC=y +# CONFIG_TCP_CONG_WESTWOOD is not set +# CONFIG_TCP_CONG_HTCP is not set +# CONFIG_TCP_CONG_HSTCP is not set +# CONFIG_TCP_CONG_HYBLA is not set +# CONFIG_TCP_CONG_VEGAS is not set +# CONFIG_TCP_CONG_SCALABLE is not set +# CONFIG_TCP_CONG_LP is not set +# CONFIG_TCP_CONG_VENO is not set +# CONFIG_TCP_CONG_YEAH is not set +# CONFIG_TCP_CONG_ILLINOIS is not set +# CONFIG_DEFAULT_BIC is not set +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_HTCP is not set +# CONFIG_DEFAULT_VEGAS is not set +# CONFIG_DEFAULT_WESTWOOD is not set +# CONFIG_DEFAULT_RENO is not set CONFIG_DEFAULT_TCP_CONG="cubic" -# CONFIG_TCP_MD5SIG is not set +CONFIG_TCP_MD5SIG=y +# CONFIG_IP_VS is not set CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set # CONFIG_IPV6_OPTIMISTIC_DAD is not set -# CONFIG_INET6_AH is not set -# CONFIG_INET6_ESP is not set +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y # CONFIG_INET6_IPCOMP is not set # CONFIG_IPV6_MIP6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y -# CONFIG_INET6_XFRM_MODE_BEET is not set +CONFIG_INET6_XFRM_MODE_BEET=y # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y +CONFIG_IPV6_NDISC_NODETYPE=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set -# CONFIG_NETWORK_SECMARK is not set -# CONFIG_NETFILTER is not set +# CONFIG_IPV6_MROUTE is not set +CONFIG_NETLABEL=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +# CONFIG_NETFILTER_ADVANCED is not set + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_NETLINK=y +CONFIG_NETFILTER_NETLINK_LOG=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_SIP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XTABLES=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_STATE=y + +# +# IP: Netfilter Configuration +# +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_PROC_COMPAT=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_TARGET_LOG=y +CONFIG_IP_NF_TARGET_ULOG=y +CONFIG_NF_NAT=y +CONFIG_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_NF_NAT_FTP=y +CONFIG_NF_NAT_IRC=y +# CONFIG_NF_NAT_TFTP is not set +# CONFIG_NF_NAT_AMANDA is not set +# CONFIG_NF_NAT_PPTP is not set +# CONFIG_NF_NAT_H323 is not set +CONFIG_NF_NAT_SIP=y +CONFIG_IP_NF_MANGLE=y + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_IPV6HEADER=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_LOG=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y # CONFIG_IP_DCCP is not set # CONFIG_IP_SCTP is not set # CONFIG_TIPC is not set @@ -409,6 +601,7 @@ CONFIG_IPV6_SIT=y # CONFIG_BRIDGE is not set # CONFIG_VLAN_8021Q is not set # CONFIG_DECNET is not set +CONFIG_LLC=y # CONFIG_LLC2 is not set # CONFIG_IPX is not set # CONFIG_ATALK is not set @@ -416,28 +609,99 @@ CONFIG_IPV6_SIT=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +# CONFIG_NET_SCH_CBQ is not set +# CONFIG_NET_SCH_HTB is not set +# CONFIG_NET_SCH_HFSC is not set +# CONFIG_NET_SCH_PRIO is not set +# CONFIG_NET_SCH_RR is not set +# CONFIG_NET_SCH_RED is not set +# CONFIG_NET_SCH_SFQ is not set +# CONFIG_NET_SCH_TEQL is not set +# CONFIG_NET_SCH_TBF is not set +# CONFIG_NET_SCH_GRED is not set +# CONFIG_NET_SCH_DSMARK is not set +# CONFIG_NET_SCH_NETEM is not set +# CONFIG_NET_SCH_INGRESS is not set + +# +# Classification +# +CONFIG_NET_CLS=y +# CONFIG_NET_CLS_BASIC is not set +# CONFIG_NET_CLS_TCINDEX is not set +# CONFIG_NET_CLS_ROUTE4 is not set +# CONFIG_NET_CLS_FW is not set +# CONFIG_NET_CLS_U32 is not set +# CONFIG_NET_CLS_RSVP is not set +# CONFIG_NET_CLS_RSVP6 is not set +# CONFIG_NET_CLS_FLOW is not set +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +# CONFIG_NET_EMATCH_CMP is not set +# CONFIG_NET_EMATCH_NBYTE is not set +# CONFIG_NET_EMATCH_U32 is not set +# CONFIG_NET_EMATCH_META is not set +# CONFIG_NET_EMATCH_TEXT is not set +CONFIG_NET_CLS_ACT=y +# CONFIG_NET_ACT_POLICE is not set +# CONFIG_NET_ACT_GACT is not set +# CONFIG_NET_ACT_MIRRED is not set +# CONFIG_NET_ACT_IPT is not set +# CONFIG_NET_ACT_NAT is not set +# CONFIG_NET_ACT_PEDIT is not set +# CONFIG_NET_ACT_SIMP is not set +CONFIG_NET_SCH_FIFO=y # # Network testing # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set -# CONFIG_HAMRADIO is not set +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +# CONFIG_AX25 is not set +# CONFIG_CAN is not set # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set +CONFIG_FIB_RULES=y # # Wireless # -# CONFIG_CFG80211 is not set -# CONFIG_WIRELESS_EXT is not set -# CONFIG_MAC80211 is not set +CONFIG_CFG80211=y +CONFIG_NL80211=y +CONFIG_WIRELESS_EXT=y +CONFIG_MAC80211=y + +# +# Rate control algorithm selection +# +CONFIG_MAC80211_RC_DEFAULT_PID=y +# CONFIG_MAC80211_RC_DEFAULT_NONE is not set + +# +# Selecting 'y' for an algorithm will +# + +# +# build the algorithm into mac80211. +# +CONFIG_MAC80211_RC_DEFAULT="pid" +CONFIG_MAC80211_RC_PID=y +# CONFIG_MAC80211_MESH is not set +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set +# CONFIG_MAC80211_DEBUG is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -449,13 +713,15 @@ CONFIG_IPV6_SIT=y # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set +CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set -# CONFIG_CONNECTOR is not set +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y # CONFIG_MTD is not set # CONFIG_PARPORT is not set CONFIG_PNP=y @@ -466,7 +732,7 @@ CONFIG_PNP=y # CONFIG_PNPACPI=y CONFIG_BLK_DEV=y -CONFIG_BLK_DEV_FD=y +# CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set # CONFIG_BLK_DEV_DAC960 is not set @@ -479,8 +745,8 @@ CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_UB is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=4096 -CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 +CONFIG_BLK_DEV_RAM_SIZE=16384 +# CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set CONFIG_MISC_DEVICES=y @@ -489,73 +755,17 @@ CONFIG_MISC_DEVICES=y # CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set +# CONFIG_ACER_WMI is not set +# CONFIG_ASUS_LAPTOP is not set +# CONFIG_FUJITSU_LAPTOP is not set +# CONFIG_TC1100_WMI is not set +# CONFIG_MSI_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -CONFIG_IDE=y -CONFIG_BLK_DEV_IDE=y - -# -# Please see Documentation/ide.txt for help/info on IDE drives -# -# CONFIG_BLK_DEV_IDE_SATA is not set -# CONFIG_BLK_DEV_HD_IDE is not set -CONFIG_BLK_DEV_IDEDISK=y -CONFIG_IDEDISK_MULTI_MODE=y -CONFIG_BLK_DEV_IDECD=y -# CONFIG_BLK_DEV_IDETAPE is not set -# CONFIG_BLK_DEV_IDEFLOPPY is not set -# CONFIG_BLK_DEV_IDESCSI is not set -CONFIG_BLK_DEV_IDEACPI=y -# CONFIG_IDE_TASK_IOCTL is not set -CONFIG_IDE_PROC_FS=y - -# -# IDE chipset support/bugfixes -# -CONFIG_IDE_GENERIC=y -# CONFIG_BLK_DEV_CMD640 is not set -# CONFIG_BLK_DEV_IDEPNP is not set -CONFIG_BLK_DEV_IDEPCI=y -# CONFIG_IDEPCI_SHARE_IRQ is not set -CONFIG_IDEPCI_PCIBUS_ORDER=y -# CONFIG_BLK_DEV_OFFBOARD is not set -# CONFIG_BLK_DEV_GENERIC is not set -# CONFIG_BLK_DEV_OPTI621 is not set -# CONFIG_BLK_DEV_RZ1000 is not set -CONFIG_BLK_DEV_IDEDMA_PCI=y -# CONFIG_BLK_DEV_IDEDMA_FORCED is not set -# CONFIG_IDEDMA_ONLYDISK is not set -# CONFIG_BLK_DEV_AEC62XX is not set -# CONFIG_BLK_DEV_ALI15X3 is not set -CONFIG_BLK_DEV_AMD74XX=y -# CONFIG_BLK_DEV_ATIIXP is not set -# CONFIG_BLK_DEV_CMD64X is not set -# CONFIG_BLK_DEV_TRIFLEX is not set -# CONFIG_BLK_DEV_CY82C693 is not set -# CONFIG_BLK_DEV_CS5520 is not set -# CONFIG_BLK_DEV_CS5530 is not set -# CONFIG_BLK_DEV_CS5535 is not set -# CONFIG_BLK_DEV_HPT34X is not set -# CONFIG_BLK_DEV_HPT366 is not set -# CONFIG_BLK_DEV_JMICRON is not set -# CONFIG_BLK_DEV_SC1200 is not set -CONFIG_BLK_DEV_PIIX=y -# CONFIG_BLK_DEV_IT8213 is not set -# CONFIG_BLK_DEV_IT821X is not set -# CONFIG_BLK_DEV_NS87415 is not set -# CONFIG_BLK_DEV_PDC202XX_OLD is not set -# CONFIG_BLK_DEV_PDC202XX_NEW is not set -# CONFIG_BLK_DEV_SVWKS is not set -# CONFIG_BLK_DEV_SIIMAGE is not set -# CONFIG_BLK_DEV_SIS5513 is not set -# CONFIG_BLK_DEV_SLC90E66 is not set -# CONFIG_BLK_DEV_TRM290 is not set -# CONFIG_BLK_DEV_VIA82CXXX is not set -# CONFIG_BLK_DEV_TC86C001 is not set -# CONFIG_IDE_ARM is not set -CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_IDEDMA_IVB is not set -# CONFIG_BLK_DEV_HD is not set +# CONFIG_INTEL_MENLOW is not set +# CONFIG_ENCLOSURE_SERVICES is not set +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set # # SCSI device support @@ -564,8 +774,8 @@ CONFIG_BLK_DEV_IDEDMA=y CONFIG_SCSI=y CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set -CONFIG_SCSI_NETLINK=y -# CONFIG_SCSI_PROC_FS is not set +# CONFIG_SCSI_NETLINK is not set +CONFIG_SCSI_PROC_FS=y # # SCSI support type (disk, tape, CD-ROM) @@ -574,7 +784,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_BLK_DEV_SR_VENDOR=y CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set @@ -582,7 +792,7 @@ CONFIG_CHR_DEV_SG=y # Some SCSI devices (e.g. CD jukebox) support multiple LUNs # # CONFIG_SCSI_MULTI_LUN is not set -# CONFIG_SCSI_CONSTANTS is not set +CONFIG_SCSI_CONSTANTS=y # CONFIG_SCSI_LOGGING is not set # CONFIG_SCSI_SCAN_ASYNC is not set CONFIG_SCSI_WAIT_SCAN=m @@ -591,81 +801,37 @@ CONFIG_SCSI_WAIT_SCAN=m # SCSI Transports # CONFIG_SCSI_SPI_ATTRS=y -CONFIG_SCSI_FC_ATTRS=y +# CONFIG_SCSI_FC_ATTRS is not set # CONFIG_SCSI_ISCSI_ATTRS is not set # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set - -# -# SCSI low-level drivers -# -# CONFIG_ISCSI_TCP is not set -CONFIG_BLK_DEV_3W_XXXX_RAID=y -# CONFIG_SCSI_3W_9XXX is not set -# CONFIG_SCSI_ACARD is not set -# CONFIG_SCSI_AACRAID is not set -CONFIG_SCSI_AIC7XXX=y -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=5000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -# CONFIG_SCSI_AIC7XXX_OLD is not set -CONFIG_SCSI_AIC79XX=y -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=4000 -# CONFIG_AIC79XX_DEBUG_ENABLE is not set -CONFIG_AIC79XX_DEBUG_MASK=0 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -# CONFIG_SCSI_AIC94XX is not set -# CONFIG_SCSI_DPT_I2O is not set -# CONFIG_SCSI_ADVANSYS is not set -# CONFIG_SCSI_ARCMSR is not set -# CONFIG_MEGARAID_NEWGEN is not set -# CONFIG_MEGARAID_LEGACY is not set -# CONFIG_MEGARAID_SAS is not set -# CONFIG_SCSI_HPTIOP is not set -# CONFIG_SCSI_BUSLOGIC is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_FUTURE_DOMAIN is not set -# CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_STEX is not set -# CONFIG_SCSI_SYM53C8XX_2 is not set -# CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_1280 is not set -# CONFIG_SCSI_QLA_FC is not set -# CONFIG_SCSI_QLA_ISCSI is not set -# CONFIG_SCSI_LPFC is not set -# CONFIG_SCSI_DC395x is not set -# CONFIG_SCSI_DC390T is not set -# CONFIG_SCSI_NSP32 is not set -# CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_SRP is not set +# CONFIG_SCSI_SRP_ATTRS is not set +# CONFIG_SCSI_LOWLEVEL is not set +# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y +CONFIG_SATA_PMP=y CONFIG_SATA_AHCI=y -CONFIG_SATA_SVW=y +# CONFIG_SATA_SIL24 is not set +CONFIG_ATA_SFF=y +# CONFIG_SATA_SVW is not set CONFIG_ATA_PIIX=y # CONFIG_SATA_MV is not set -CONFIG_SATA_NV=y +# CONFIG_SATA_NV is not set # CONFIG_PDC_ADMA is not set # CONFIG_SATA_QSTOR is not set # CONFIG_SATA_PROMISE is not set # CONFIG_SATA_SX4 is not set -CONFIG_SATA_SIL=y -# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIL is not set # CONFIG_SATA_SIS is not set # CONFIG_SATA_ULI is not set -CONFIG_SATA_VIA=y +# CONFIG_SATA_VIA is not set # CONFIG_SATA_VITESSE is not set # CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ACPI is not set # CONFIG_PATA_ALI is not set -# CONFIG_PATA_AMD is not set +CONFIG_PATA_AMD=y # CONFIG_PATA_ARTOP is not set # CONFIG_PATA_ATIIXP is not set # CONFIG_PATA_CMD640_PCI is not set @@ -673,6 +839,7 @@ CONFIG_SATA_VIA=y # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set # CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CS5536 is not set # CONFIG_PATA_CYPRESS is not set # CONFIG_PATA_EFAR is not set # CONFIG_ATA_GENERIC is not set @@ -686,11 +853,14 @@ CONFIG_SATA_VIA=y # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set # CONFIG_PATA_MPIIX is not set -# CONFIG_PATA_OLDPIIX is not set +CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NINJA32 is not set # CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_NS87415 is not set # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PCMCIA is not set # CONFIG_PATA_PDC_OLD is not set # CONFIG_PATA_RADISYS is not set # CONFIG_PATA_RZ1000 is not set @@ -702,65 +872,42 @@ CONFIG_SATA_VIA=y # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set CONFIG_MD=y -# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_MD=y +# CONFIG_MD_LINEAR is not set +# CONFIG_MD_RAID0 is not set +# CONFIG_MD_RAID1 is not set +# CONFIG_MD_RAID10 is not set +# CONFIG_MD_RAID456 is not set +# CONFIG_MD_MULTIPATH is not set +# CONFIG_MD_FAULTY is not set CONFIG_BLK_DEV_DM=y # CONFIG_DM_DEBUG is not set # CONFIG_DM_CRYPT is not set # CONFIG_DM_SNAPSHOT is not set -# CONFIG_DM_MIRROR is not set -# CONFIG_DM_ZERO is not set +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y # CONFIG_DM_MULTIPATH is not set # CONFIG_DM_DELAY is not set - -# -# Fusion MPT device support -# -CONFIG_FUSION=y -CONFIG_FUSION_SPI=y -# CONFIG_FUSION_FC is not set -# CONFIG_FUSION_SAS is not set -CONFIG_FUSION_MAX_SGE=128 -# CONFIG_FUSION_CTL is not set +# CONFIG_DM_UEVENT is not set +# CONFIG_FUSION is not set # # IEEE 1394 (FireWire) support # # CONFIG_FIREWIRE is not set -CONFIG_IEEE1394=y - -# -# Subsystem Options -# -# CONFIG_IEEE1394_VERBOSEDEBUG is not set - -# -# Controllers -# - -# -# Texas Instruments PCILynx requires I2C -# -CONFIG_IEEE1394_OHCI1394=y - -# -# Protocols -# -# CONFIG_IEEE1394_VIDEO1394 is not set -# CONFIG_IEEE1394_SBP2 is not set -# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set -# CONFIG_IEEE1394_ETH1394 is not set -# CONFIG_IEEE1394_DV1394 is not set -CONFIG_IEEE1394_RAWIO=y +# CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y -# CONFIG_MAC_EMUMOUSEBTN is not set +CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -CONFIG_NETDEVICES_MULTIQUEUE=y +# CONFIG_NETDEVICES_MULTIQUEUE is not set +# CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set +# CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set @@ -770,38 +917,40 @@ CONFIG_MII=y # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set CONFIG_NET_VENDOR_3COM=y -CONFIG_VORTEX=y +# CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set -CONFIG_TULIP=y -# CONFIG_TULIP_MWI is not set -# CONFIG_TULIP_MMIO is not set -# CONFIG_TULIP_NAPI is not set +# CONFIG_TULIP is not set # CONFIG_DE4X5 is not set # CONFIG_WINBOND_840 is not set # CONFIG_DM9102 is not set # CONFIG_ULI526X is not set +# CONFIG_PCMCIA_XIRCOM is not set # CONFIG_HP100 is not set +# CONFIG_IBM_NEW_EMAC_ZMII is not set +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set # CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set -CONFIG_B44=y +# CONFIG_B44 is not set CONFIG_FORCEDETH=y # CONFIG_FORCEDETH_NAPI is not set -# CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set -CONFIG_8139CP=y +# CONFIG_8139CP is not set CONFIG_8139TOO=y -# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_PIO=y # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set # CONFIG_8139_OLD_RX_RESET is not set +# CONFIG_R6040 is not set # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set @@ -814,38 +963,75 @@ CONFIG_NETDEV_1000=y CONFIG_E1000=y # CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set -CONFIG_E1000E=y -CONFIG_E1000E_ENABLED=y +# CONFIG_E1000E is not set +# CONFIG_E1000E_ENABLED is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set -CONFIG_R8169=y -# CONFIG_R8169_NAPI is not set +# CONFIG_R8169 is not set # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y +# CONFIG_SKY2_DEBUG is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y -CONFIG_BNX2=y +# CONFIG_BNX2 is not set # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set +# CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set +# CONFIG_NIU is not set # CONFIG_MLX4_CORE is not set -# CONFIG_TR is not set +# CONFIG_TEHUTI is not set +# CONFIG_BNX2X is not set +# CONFIG_SFC is not set +CONFIG_TR=y +# CONFIG_IBMOL is not set +# CONFIG_IBMLS is not set +# CONFIG_3C359 is not set +# CONFIG_TMS380TR is not set # # Wireless LAN # # CONFIG_WLAN_PRE80211 is not set -# CONFIG_WLAN_80211 is not set +CONFIG_WLAN_80211=y +# CONFIG_PCMCIA_RAYCS is not set +# CONFIG_IPW2100 is not set +# CONFIG_IPW2200 is not set +# CONFIG_LIBERTAS is not set +# CONFIG_AIRO is not set +# CONFIG_HERMES is not set +# CONFIG_ATMEL is not set +# CONFIG_AIRO_CS is not set +# CONFIG_PCMCIA_WL3501 is not set +# CONFIG_PRISM54 is not set +# CONFIG_USB_ZD1201 is not set +# CONFIG_USB_NET_RNDIS_WLAN is not set +# CONFIG_RTL8180 is not set +# CONFIG_RTL8187 is not set +# CONFIG_ADM8211 is not set +# CONFIG_P54_COMMON is not set +CONFIG_ATH5K=y +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_IWLWIFI is not set +# CONFIG_IWLCORE is not set +# CONFIG_IWLWIFI_LEDS is not set +# CONFIG_IWL4965 is not set +# CONFIG_IWL3945 is not set +# CONFIG_HOSTAP is not set +# CONFIG_B43 is not set +# CONFIG_B43LEGACY is not set +# CONFIG_ZD1211RW is not set +# CONFIG_RT2X00 is not set # # USB Network Adapters @@ -854,16 +1040,27 @@ CONFIG_NETDEV_10000=y # CONFIG_USB_KAWETH is not set # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set -# CONFIG_USB_USBNET_MII is not set # CONFIG_USB_USBNET is not set +CONFIG_NET_PCMCIA=y +# CONFIG_PCMCIA_3C589 is not set +# CONFIG_PCMCIA_3C574 is not set +# CONFIG_PCMCIA_FMVJ18X is not set +# CONFIG_PCMCIA_PCNET is not set +# CONFIG_PCMCIA_NMCLAN is not set +# CONFIG_PCMCIA_SMC91C92 is not set +# CONFIG_PCMCIA_XIRC2PS is not set +# CONFIG_PCMCIA_AXNET is not set +# CONFIG_PCMCIA_IBMTR is not set # CONFIG_WAN is not set -# CONFIG_FDDI is not set +CONFIG_FDDI=y +# CONFIG_DEFXX is not set +# CONFIG_SKFP is not set # CONFIG_HIPPI is not set # CONFIG_PPP is not set # CONFIG_SLIP is not set # CONFIG_NET_FC is not set -# CONFIG_SHAPER is not set CONFIG_NETCONSOLE=y +# CONFIG_NETCONSOLE_DYNAMIC is not set CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y @@ -874,18 +1071,17 @@ CONFIG_NET_POLL_CONTROLLER=y # Input device support # CONFIG_INPUT=y -# CONFIG_INPUT_FF_MEMLESS is not set -# CONFIG_INPUT_POLLDEV is not set +CONFIG_INPUT_FF_MEMLESS=y +CONFIG_INPUT_POLLDEV=y # # Userland interfaces # CONFIG_INPUT_MOUSEDEV=y -CONFIG_INPUT_MOUSEDEV_PSAUX=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_JOYDEV is not set -# CONFIG_INPUT_TSDEV is not set CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_EVBUG is not set @@ -910,17 +1106,63 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set # CONFIG_MOUSE_VSXXXAA is not set -# CONFIG_INPUT_JOYSTICK is not set -# CONFIG_INPUT_TABLET is not set -# CONFIG_INPUT_TOUCHSCREEN is not set -# CONFIG_INPUT_MISC is not set +CONFIG_INPUT_JOYSTICK=y +# CONFIG_JOYSTICK_ANALOG is not set +# CONFIG_JOYSTICK_A3D is not set +# CONFIG_JOYSTICK_ADI is not set +# CONFIG_JOYSTICK_COBRA is not set +# CONFIG_JOYSTICK_GF2K is not set +# CONFIG_JOYSTICK_GRIP is not set +# CONFIG_JOYSTICK_GRIP_MP is not set +# CONFIG_JOYSTICK_GUILLEMOT is not set +# CONFIG_JOYSTICK_INTERACT is not set +# CONFIG_JOYSTICK_SIDEWINDER is not set +# CONFIG_JOYSTICK_TMDC is not set +# CONFIG_JOYSTICK_IFORCE is not set +# CONFIG_JOYSTICK_WARRIOR is not set +# CONFIG_JOYSTICK_MAGELLAN is not set +# CONFIG_JOYSTICK_SPACEORB is not set +# CONFIG_JOYSTICK_SPACEBALL is not set +# CONFIG_JOYSTICK_STINGER is not set +# CONFIG_JOYSTICK_TWIDJOY is not set +# CONFIG_JOYSTICK_ZHENHUA is not set +# CONFIG_JOYSTICK_JOYDUMP is not set +# CONFIG_JOYSTICK_XPAD is not set +CONFIG_INPUT_TABLET=y +# CONFIG_TABLET_USB_ACECAD is not set +# CONFIG_TABLET_USB_AIPTEK is not set +# CONFIG_TABLET_USB_GTCO is not set +# CONFIG_TABLET_USB_KBTAB is not set +# CONFIG_TABLET_USB_WACOM is not set +CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_FUJITSU is not set +# CONFIG_TOUCHSCREEN_GUNZE is not set +# CONFIG_TOUCHSCREEN_ELO is not set +# CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_MK712 is not set +# CONFIG_TOUCHSCREEN_PENMOUNT is not set +# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set +# CONFIG_TOUCHSCREEN_TOUCHWIN is not set +# CONFIG_TOUCHSCREEN_UCB1400 is not set +# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +CONFIG_INPUT_MISC=y +# CONFIG_INPUT_PCSPKR is not set +# CONFIG_INPUT_APANEL is not set +# CONFIG_INPUT_WISTRON_BTNS is not set +# CONFIG_INPUT_ATLAS_BTNS is not set +# CONFIG_INPUT_ATI_REMOTE is not set +# CONFIG_INPUT_ATI_REMOTE2 is not set +# CONFIG_INPUT_KEYSPAN_REMOTE is not set +# CONFIG_INPUT_POWERMATE is not set +# CONFIG_INPUT_YEALINK is not set +# CONFIG_INPUT_UINPUT is not set # # Hardware I/O ports # CONFIG_SERIO=y CONFIG_SERIO_I8042=y -# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_SERPORT=y # CONFIG_SERIO_CT82C710 is not set # CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y @@ -933,8 +1175,26 @@ CONFIG_SERIO_LIBPS2=y CONFIG_VT=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y -# CONFIG_VT_HW_CONSOLE_BINDING is not set -# CONFIG_SERIAL_NONSTANDARD is not set +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_DEVKMEM=y +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_COMPUTONE is not set +# CONFIG_ROCKETPORT is not set +# CONFIG_CYCLADES is not set +# CONFIG_DIGIEPCA is not set +# CONFIG_MOXA_INTELLIO is not set +# CONFIG_MOXA_SMARTIO is not set +# CONFIG_ISI is not set +# CONFIG_SYNCLINK is not set +# CONFIG_SYNCLINKMP is not set +# CONFIG_SYNCLINK_GT is not set +# CONFIG_N_HDLC is not set +# CONFIG_RISCOM8 is not set +# CONFIG_SPECIALIX is not set +# CONFIG_SX is not set +# CONFIG_RIO is not set +# CONFIG_STALDRV is not set +# CONFIG_NOZOMI is not set # # Serial drivers @@ -944,9 +1204,14 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y -CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_CS is not set +CONFIG_SERIAL_8250_NR_UARTS=32 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -# CONFIG_SERIAL_8250_EXTENDED is not set +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_DETECT_IRQ=y +CONFIG_SERIAL_8250_RSA=y # # Non-8250 serial port support @@ -955,89 +1220,275 @@ CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set -# CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y -CONFIG_HW_RANDOM_INTEL=y -CONFIG_HW_RANDOM_AMD=y +# CONFIG_HW_RANDOM_INTEL is not set +# CONFIG_HW_RANDOM_AMD is not set CONFIG_HW_RANDOM_GEODE=y CONFIG_HW_RANDOM_VIA=y -# CONFIG_NVRAM is not set -CONFIG_RTC=y +CONFIG_NVRAM=y # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set # CONFIG_SONYPI is not set -CONFIG_AGP=y -# CONFIG_AGP_ALI is not set -# CONFIG_AGP_ATI is not set -# CONFIG_AGP_AMD is not set -CONFIG_AGP_AMD64=y -CONFIG_AGP_INTEL=y -# CONFIG_AGP_NVIDIA is not set -# CONFIG_AGP_SIS is not set -# CONFIG_AGP_SWORKS is not set -# CONFIG_AGP_VIA is not set -# CONFIG_AGP_EFFICEON is not set -# CONFIG_DRM is not set + +# +# PCMCIA character devices +# +# CONFIG_SYNCLINK_CS is not set +# CONFIG_CARDMAN_4000 is not set +# CONFIG_CARDMAN_4040 is not set +# CONFIG_IPWIRELESS is not set # CONFIG_MWAVE is not set # CONFIG_PC8736x_GPIO is not set # CONFIG_NSC_GPIO is not set # CONFIG_CS5535_GPIO is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=256 +# CONFIG_RAW_DRIVER is not set CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set -CONFIG_HPET_MMAP=y +# CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y -# CONFIG_I2C is not set - -# -# SPI support -# +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +# CONFIG_I2C_CHARDEV is not set + +# +# I2C Hardware Bus support +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +CONFIG_I2C_I801=y +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PROSAVAGE is not set +# CONFIG_I2C_SAVAGE4 is not set +# CONFIG_I2C_SIMTEC is not set +# CONFIG_SCx200_ACB is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_TINY_USB is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +# CONFIG_I2C_PCA_PLATFORM is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_EEPROM is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set -# CONFIG_SPI_MASTER is not set # CONFIG_W1 is not set -# CONFIG_POWER_SUPPLY is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +# CONFIG_PDA_POWER is not set +# CONFIG_BATTERY_DS2760 is not set # CONFIG_HWMON is not set +CONFIG_THERMAL=y +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set + +# +# Watchdog Device Drivers +# +# CONFIG_SOFT_WATCHDOG is not set +# CONFIG_ACQUIRE_WDT is not set +# CONFIG_ADVANTECH_WDT is not set +# CONFIG_ALIM1535_WDT is not set +# CONFIG_ALIM7101_WDT is not set +# CONFIG_SC520_WDT is not set +# CONFIG_EUROTECH_WDT is not set +# CONFIG_IB700_WDT is not set +# CONFIG_IBMASR is not set +# CONFIG_WAFER_WDT is not set +# CONFIG_I6300ESB_WDT is not set +# CONFIG_ITCO_WDT is not set +# CONFIG_IT8712F_WDT is not set +# CONFIG_HP_WATCHDOG is not set +# CONFIG_SC1200_WDT is not set +# CONFIG_PC87413_WDT is not set +# CONFIG_60XX_WDT is not set +# CONFIG_SBC8360_WDT is not set +# CONFIG_SBC7240_WDT is not set +# CONFIG_CPU5_WDT is not set +# CONFIG_SMSC37B787_WDT is not set +# CONFIG_W83627HF_WDT is not set +# CONFIG_W83697HF_WDT is not set +# CONFIG_W83877F_WDT is not set +# CONFIG_W83977F_WDT is not set +# CONFIG_MACHZ_WDT is not set +# CONFIG_SBC_EPX_C3_WATCHDOG is not set + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set + +# +# Sonics Silicon Backplane +# +CONFIG_SSB_POSSIBLE=y +# CONFIG_SSB is not set # # Multifunction device drivers # # CONFIG_MFD_SM501 is not set +# CONFIG_HTC_PASIC3 is not set # # Multimedia devices # + +# +# Multimedia core support +# # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set + +# +# Multimedia drivers +# CONFIG_DAB=y # CONFIG_USB_DABUSB is not set # # Graphics support # -# CONFIG_BACKLIGHT_LCD_SUPPORT is not set +CONFIG_AGP=y +# CONFIG_AGP_ALI is not set +# CONFIG_AGP_ATI is not set +# CONFIG_AGP_AMD is not set +CONFIG_AGP_AMD64=y +CONFIG_AGP_INTEL=y +# CONFIG_AGP_NVIDIA is not set +# CONFIG_AGP_SIS is not set +# CONFIG_AGP_SWORKS is not set +# CONFIG_AGP_VIA is not set +# CONFIG_AGP_EFFICEON is not set +CONFIG_DRM=y +# CONFIG_DRM_TDFX is not set +# CONFIG_DRM_R128 is not set +# CONFIG_DRM_RADEON is not set +# CONFIG_DRM_I810 is not set +# CONFIG_DRM_I830 is not set +CONFIG_DRM_I915=y +# CONFIG_DRM_MGA is not set +# CONFIG_DRM_SIS is not set +# CONFIG_DRM_VIA is not set +# CONFIG_DRM_SAVAGE is not set +# CONFIG_VGASTATE is not set +# CONFIG_VIDEO_OUTPUT_CONTROL is not set +CONFIG_FB=y +# CONFIG_FIRMWARE_EDID is not set +# CONFIG_FB_DDC is not set +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set +# CONFIG_FB_SYS_FILLRECT is not set +# CONFIG_FB_SYS_COPYAREA is not set +# CONFIG_FB_SYS_IMAGEBLIT is not set +# CONFIG_FB_FOREIGN_ENDIAN is not set +# CONFIG_FB_SYS_FOPS is not set +CONFIG_FB_DEFERRED_IO=y +# CONFIG_FB_SVGALIB is not set +# CONFIG_FB_MACMODES is not set +# CONFIG_FB_BACKLIGHT is not set +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +# CONFIG_FB_VESA is not set +CONFIG_FB_EFI=y +# CONFIG_FB_IMAC is not set +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I810 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_CYBLA is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_GEODE is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_BACKLIGHT_CLASS_DEVICE=y +# CONFIG_BACKLIGHT_CORGI is not set +# CONFIG_BACKLIGHT_PROGEAR is not set # # Display device support # # CONFIG_DISPLAY_SUPPORT is not set -# CONFIG_VGASTATE is not set -# CONFIG_FB is not set # # Console display driver support # CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y -CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_LOGO_LINUX_CLUT224=y # # Sound @@ -1047,33 +1498,167 @@ CONFIG_SOUND=y # # Advanced Linux Sound Architecture # -# CONFIG_SND is not set +CONFIG_SND=y +CONFIG_SND_TIMER=y +CONFIG_SND_PCM=y +CONFIG_SND_HWDEP=y +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQ_DUMMY=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=y +CONFIG_SND_PCM_OSS=y +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_SUPPORT_OLD_API=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y + +# +# Generic devices +# +# CONFIG_SND_PCSP is not set +# CONFIG_SND_DUMMY is not set +# CONFIG_SND_VIRMIDI is not set +# CONFIG_SND_MTPAV is not set +# CONFIG_SND_SERIAL_U16550 is not set +# CONFIG_SND_MPU401 is not set + +# +# PCI devices +# +# CONFIG_SND_AD1889 is not set +# CONFIG_SND_ALS300 is not set +# CONFIG_SND_ALS4000 is not set +# CONFIG_SND_ALI5451 is not set +# CONFIG_SND_ATIIXP is not set +# CONFIG_SND_ATIIXP_MODEM is not set +# CONFIG_SND_AU8810 is not set +# CONFIG_SND_AU8820 is not set +# CONFIG_SND_AU8830 is not set +# CONFIG_SND_AW2 is not set +# CONFIG_SND_AZT3328 is not set +# CONFIG_SND_BT87X is not set +# CONFIG_SND_CA0106 is not set +# CONFIG_SND_CMIPCI is not set +# CONFIG_SND_OXYGEN is not set +# CONFIG_SND_CS4281 is not set +# CONFIG_SND_CS46XX is not set +# CONFIG_SND_CS5530 is not set +# CONFIG_SND_CS5535AUDIO is not set +# CONFIG_SND_DARLA20 is not set +# CONFIG_SND_GINA20 is not set +# CONFIG_SND_LAYLA20 is not set +# CONFIG_SND_DARLA24 is not set +# CONFIG_SND_GINA24 is not set +# CONFIG_SND_LAYLA24 is not set +# CONFIG_SND_MONA is not set +# CONFIG_SND_MIA is not set +# CONFIG_SND_ECHO3G is not set +# CONFIG_SND_INDIGO is not set +# CONFIG_SND_INDIGOIO is not set +# CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_EMU10K1 is not set +# CONFIG_SND_EMU10K1X is not set +# CONFIG_SND_ENS1370 is not set +# CONFIG_SND_ENS1371 is not set +# CONFIG_SND_ES1938 is not set +# CONFIG_SND_ES1968 is not set +# CONFIG_SND_FM801 is not set +CONFIG_SND_HDA_INTEL=y +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_CODEC_REALTEK=y +CONFIG_SND_HDA_CODEC_ANALOG=y +CONFIG_SND_HDA_CODEC_SIGMATEL=y +CONFIG_SND_HDA_CODEC_VIA=y +CONFIG_SND_HDA_CODEC_ATIHDMI=y +CONFIG_SND_HDA_CODEC_CONEXANT=y +CONFIG_SND_HDA_CODEC_CMEDIA=y +CONFIG_SND_HDA_CODEC_SI3054=y +CONFIG_SND_HDA_GENERIC=y +# CONFIG_SND_HDA_POWER_SAVE is not set +# CONFIG_SND_HDSP is not set +# CONFIG_SND_HDSPM is not set +# CONFIG_SND_HIFIER is not set +# CONFIG_SND_ICE1712 is not set +# CONFIG_SND_ICE1724 is not set +# CONFIG_SND_INTEL8X0 is not set +# CONFIG_SND_INTEL8X0M is not set +# CONFIG_SND_KORG1212 is not set +# CONFIG_SND_MAESTRO3 is not set +# CONFIG_SND_MIXART is not set +# CONFIG_SND_NM256 is not set +# CONFIG_SND_PCXHR is not set +# CONFIG_SND_RIPTIDE is not set +# CONFIG_SND_RME32 is not set +# CONFIG_SND_RME96 is not set +# CONFIG_SND_RME9652 is not set +# CONFIG_SND_SIS7019 is not set +# CONFIG_SND_SONICVIBES is not set +# CONFIG_SND_TRIDENT is not set +# CONFIG_SND_VIA82XX is not set +# CONFIG_SND_VIA82XX_MODEM is not set +# CONFIG_SND_VIRTUOSO is not set +# CONFIG_SND_VX222 is not set +# CONFIG_SND_YMFPCI is not set + +# +# USB devices +# +# CONFIG_SND_USB_AUDIO is not set +# CONFIG_SND_USB_USX2Y is not set +# CONFIG_SND_USB_CAIAQ is not set + +# +# PCMCIA devices +# +# CONFIG_SND_VXPOCKET is not set +# CONFIG_SND_PDAUDIOCF is not set + +# +# System on Chip audio support +# +# CONFIG_SND_SOC is not set + +# +# ALSA SoC audio for Freescale SOCs +# + +# +# SoC Audio for the Texas Instruments OMAP +# # # Open Sound System # -CONFIG_SOUND_PRIME=y -# CONFIG_SOUND_TRIDENT is not set -# CONFIG_SOUND_MSNDCLAS is not set -# CONFIG_SOUND_MSNDPIN is not set -# CONFIG_SOUND_OSS is not set +# CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y -# CONFIG_HID_DEBUG is not set +CONFIG_HID_DEBUG=y +CONFIG_HIDRAW=y # # USB Input Devices # CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set +CONFIG_USB_HIDINPUT_POWERBOOK=y +CONFIG_HID_FF=y +CONFIG_HID_PID=y +CONFIG_LOGITECH_FF=y +# CONFIG_LOGIRUMBLEPAD2_FF is not set +CONFIG_PANTHERLORD_FF=y +CONFIG_THRUSTMASTER_FF=y +CONFIG_ZEROPLUS_FF=y +CONFIG_USB_HIDDEV=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y CONFIG_USB=y -# CONFIG_USB_DEBUG is not set +CONFIG_USB_DEBUG=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # # Miscellaneous USB options @@ -1081,18 +1666,18 @@ CONFIG_USB=y CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set -# CONFIG_USB_SUSPEND is not set -# CONFIG_USB_PERSIST is not set +CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set # # USB Host Controller Drivers # +# CONFIG_USB_C67X00_HCD is not set CONFIG_USB_EHCI_HCD=y -# CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_ISP116X_HCD is not set +# CONFIG_USB_ISP1760_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set # CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set @@ -1125,8 +1710,10 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_SDDR55 is not set # CONFIG_USB_STORAGE_JUMPSHOT is not set # CONFIG_USB_STORAGE_ALAUDA is not set +# CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set -# CONFIG_USB_LIBUSUAL is not set +# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set +CONFIG_USB_LIBUSUAL=y # # USB Imaging devices @@ -1138,10 +1725,6 @@ CONFIG_USB_MON=y # # USB port drivers # - -# -# USB Serial Converter support -# # CONFIG_USB_SERIAL is not set # @@ -1167,90 +1750,125 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set +# CONFIG_USB_GADGET is not set +# CONFIG_MMC is not set +# CONFIG_MEMSTICK is not set +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y # -# USB DSL modem support +# LED drivers # +# CONFIG_LEDS_CLEVO_MAIL is not set # -# USB Gadget Support +# LED Triggers # -# CONFIG_USB_GADGET is not set -# CONFIG_MMC is not set +CONFIG_LEDS_TRIGGERS=y +# CONFIG_LEDS_TRIGGER_TIMER is not set +# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set +# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set +# CONFIG_ACCESSIBILITY is not set +# CONFIG_INFINIBAND is not set +CONFIG_EDAC=y # -# LED devices +# Reporting subsystems # -# CONFIG_NEW_LEDS is not set +# CONFIG_EDAC_DEBUG is not set +# CONFIG_EDAC_MM_EDAC is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +# CONFIG_RTC_DEBUG is not set # -# LED drivers +# RTC interfaces # +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set # -# LED Triggers +# I2C RTC drivers # -# CONFIG_INFINIBAND is not set -# CONFIG_EDAC is not set +# CONFIG_RTC_DRV_DS1307 is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_S35390A is not set # -# Real Time Clock +# SPI RTC drivers # -# CONFIG_RTC_CLASS is not set # -# DMA Engine support +# Platform RTC drivers # -# CONFIG_DMA_ENGINE is not set +CONFIG_RTC_DRV_CMOS=y +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_V3020 is not set # -# DMA Clients +# on-CPU RTC drivers # +CONFIG_DMADEVICES=y # # DMA Devices # -CONFIG_VIRTUALIZATION=y -# CONFIG_KVM is not set +# CONFIG_INTEL_IOATDMA is not set +# CONFIG_UIO is not set # -# Userspace I/O +# Firmware Drivers # -# CONFIG_UIO is not set +# CONFIG_EDD is not set +CONFIG_EFI_VARS=y +# CONFIG_DELL_RBU is not set +# CONFIG_DCDBAS is not set +CONFIG_DMIID=y +# CONFIG_ISCSI_IBFT_FIND is not set # # File systems # -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -# CONFIG_EXT2_FS_SECURITY is not set -# CONFIG_EXT2_FS_XIP is not set +# CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y -# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_EXT3_FS_SECURITY=y # CONFIG_EXT4DEV_FS is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=y -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -# CONFIG_REISERFS_FS_SECURITY is not set +# CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set -# CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_ROMFS_FS is not set +CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y -# CONFIG_QUOTA is not set -CONFIG_DNOTIFY=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QFMT_V1 is not set +CONFIG_QFMT_V2=y +CONFIG_QUOTACTL=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set @@ -1260,8 +1878,8 @@ CONFIG_GENERIC_ACL=y # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y -# CONFIG_JOLIET is not set -# CONFIG_ZISOFS is not set +CONFIG_JOLIET=y +CONFIG_ZISOFS=y # CONFIG_UDF_FS is not set # @@ -1279,13 +1897,13 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y -CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set # @@ -1293,6 +1911,7 @@ CONFIG_RAMFS=y # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +# CONFIG_ECRYPT_FS is not set # CONFIG_HFS_FS is not set # CONFIG_HFSPLUS_FS is not set # CONFIG_BEFS_FS is not set @@ -1300,33 +1919,15 @@ CONFIG_RAMFS=y # CONFIG_EFS_FS is not set # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set +# CONFIG_MINIX_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set +# CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -# CONFIG_NFS_V3_ACL is not set -# CONFIG_NFS_V4 is not set -# CONFIG_NFS_DIRECTIO is not set -CONFIG_NFSD=y -CONFIG_NFSD_V3=y -# CONFIG_NFSD_V3_ACL is not set -# CONFIG_NFSD_V4 is not set -CONFIG_NFSD_TCP=y -CONFIG_ROOT_NFS=y -CONFIG_LOCKD=y -CONFIG_LOCKD_V4=y -CONFIG_EXPORTFS=y -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=y -# CONFIG_SUNRPC_BIND34 is not set -# CONFIG_RPCSEC_GSS_KRB5 is not set -# CONFIG_RPCSEC_GSS_SPKM3 is not set +CONFIG_NETWORK_FILESYSTEMS=y +# CONFIG_NFS_FS is not set +# CONFIG_NFSD is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set @@ -1336,14 +1937,26 @@ CONFIG_SUNRPC=y # # Partition Types # -# CONFIG_PARTITION_ADVANCED is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set CONFIG_NLS=y -CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y # CONFIG_NLS_CODEPAGE_737 is not set # CONFIG_NLS_CODEPAGE_775 is not set @@ -1378,37 +1991,33 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_ISO8859_9 is not set # CONFIG_NLS_ISO8859_13 is not set # CONFIG_NLS_ISO8859_14 is not set -CONFIG_NLS_ISO8859_15=y +# CONFIG_NLS_ISO8859_15 is not set # CONFIG_NLS_KOI8_R is not set # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y - -# -# Distributed Lock Manager -# # CONFIG_DLM is not set -CONFIG_INSTRUMENTATION=y -CONFIG_PROFILING=y -CONFIG_OPROFILE=y -CONFIG_KPROBES=y # # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y -CONFIG_UNUSED_SYMBOLS=y -# CONFIG_DEBUG_FS is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set -CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHED_DEBUG is not set -# CONFIG_SCHEDSTATS is not set +CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y +# CONFIG_DEBUG_OBJECTS is not set # CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set @@ -1423,48 +2032,174 @@ CONFIG_TIMER_STATS=y CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_WRITECOUNT is not set # CONFIG_DEBUG_LIST is not set -# CONFIG_FRAME_POINTER is not set -CONFIG_OPTIMIZE_INLINING=y +# CONFIG_DEBUG_SG is not set +CONFIG_FRAME_POINTER=y +# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set +# CONFIG_LATENCYTOP is not set +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y +# CONFIG_SAMPLES is not set +# CONFIG_KGDB is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_NONPROMISC_DEVMEM is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_RODATA is not set +CONFIG_DEBUG_STACK_USAGE=y +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_X86_PTDUMP is not set +CONFIG_DEBUG_RODATA=y +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y CONFIG_DOUBLEFAULT=y +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set # # Security options # -# CONFIG_KEYS is not set -# CONFIG_SECURITY is not set -# CONFIG_CRYPTO is not set +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +# CONFIG_SECURITY_NETWORK_XFRM is not set +CONFIG_SECURITY_CAPABILITIES=y +CONFIG_SECURITY_FILE_CAPABILITIES=y +# CONFIG_SECURITY_ROOTPLUG is not set +CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 +CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 +# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set +# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set +# CONFIG_SECURITY_SMACK is not set +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_MANAGER=y +# CONFIG_CRYPTO_GF128MUL is not set +# CONFIG_CRYPTO_NULL is not set +# CONFIG_CRYPTO_CRYPTD is not set +CONFIG_CRYPTO_AUTHENC=y +# CONFIG_CRYPTO_TEST is not set + +# +# Authenticated Encryption with Associated Data +# +# CONFIG_CRYPTO_CCM is not set +# CONFIG_CRYPTO_GCM is not set +# CONFIG_CRYPTO_SEQIV is not set + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +# CONFIG_CRYPTO_CTR is not set +# CONFIG_CRYPTO_CTS is not set +CONFIG_CRYPTO_ECB=y +# CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_PCBC is not set +# CONFIG_CRYPTO_XTS is not set + +# +# Hash modes +# +CONFIG_CRYPTO_HMAC=y +# CONFIG_CRYPTO_XCBC is not set + +# +# Digest +# +# CONFIG_CRYPTO_CRC32C is not set +# CONFIG_CRYPTO_MD4 is not set +CONFIG_CRYPTO_MD5=y +# CONFIG_CRYPTO_MICHAEL_MIC is not set +CONFIG_CRYPTO_SHA1=y +# CONFIG_CRYPTO_SHA256 is not set +# CONFIG_CRYPTO_SHA512 is not set +# CONFIG_CRYPTO_TGR192 is not set +# CONFIG_CRYPTO_WP512 is not set + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +# CONFIG_CRYPTO_AES_586 is not set +# CONFIG_CRYPTO_ANUBIS is not set +CONFIG_CRYPTO_ARC4=y +# CONFIG_CRYPTO_BLOWFISH is not set +# CONFIG_CRYPTO_CAMELLIA is not set +# CONFIG_CRYPTO_CAST5 is not set +# CONFIG_CRYPTO_CAST6 is not set +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_FCRYPT is not set +# CONFIG_CRYPTO_KHAZAD is not set +# CONFIG_CRYPTO_SALSA20 is not set +# CONFIG_CRYPTO_SALSA20_586 is not set +# CONFIG_CRYPTO_SEED is not set +# CONFIG_CRYPTO_SERPENT is not set +# CONFIG_CRYPTO_TEA is not set +# CONFIG_CRYPTO_TWOFISH is not set +# CONFIG_CRYPTO_TWOFISH_586 is not set + +# +# Compression +# +# CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_LZO is not set +CONFIG_CRYPTO_HW=y +# CONFIG_CRYPTO_DEV_PADLOCK is not set +# CONFIG_CRYPTO_DEV_GEODE is not set +# CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_HAVE_KVM=y +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set +# CONFIG_LGUEST is not set +# CONFIG_VIRTIO_PCI is not set +# CONFIG_VIRTIO_BALLOON is not set # # Library routines # CONFIG_BITREVERSE=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set +CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y -CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y -CONFIG_X86_TRAMPOLINE=y -CONFIG_KTIME_SCALAR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2f1b356..ae5124e 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.26-rc1 -# Sun May 4 19:27:29 2008 +# Sun May 4 19:59:57 2008 # CONFIG_64BIT=y # CONFIG_X86_32 is not set @@ -61,27 +61,42 @@ CONFIG_EXPERIMENTAL=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y +# CONFIG_LOCALVERSION_AUTO is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y -# CONFIG_BSD_PROCESS_ACCT is not set -# CONFIG_TASKSTATS is not set -# CONFIG_AUDIT is not set -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=18 -# CONFIG_CGROUPS is not set -# CONFIG_GROUP_SCHED is not set +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_AUDIT_TREE=y +# CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_CGROUPS=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_CGROUP_NS=y +# CONFIG_CGROUP_DEVICE is not set +CONFIG_CPUSETS=y +CONFIG_GROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +# CONFIG_RT_GROUP_SCHED is not set # CONFIG_USER_SCHED is not set -# CONFIG_CGROUP_SCHED is not set +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_RESOURCE_COUNTERS=y +# CONFIG_CGROUP_MEM_RES_CTLR is not set # CONFIG_SYSFS_DEPRECATED_V2 is not set +CONFIG_PROC_PID_CPUSET=y CONFIG_RELAY=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y CONFIG_IPC_NS=y -# CONFIG_USER_NS is not set +CONFIG_USER_NS=y CONFIG_PID_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" @@ -93,7 +108,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y -# CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -108,12 +123,13 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_PROFILING=y -# CONFIG_MARKERS is not set -CONFIG_OPROFILE=y +CONFIG_MARKERS=y +# CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y CONFIG_KRETPROBES=y @@ -133,15 +149,15 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -# CONFIG_BLK_DEV_IO_TRACE is not set -# CONFIG_BLK_DEV_BSG is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_BLK_DEV_BSG=y CONFIG_BLOCK_COMPAT=y # # IO Schedulers # CONFIG_IOSCHED_NOOP=y -# CONFIG_IOSCHED_AS is not set +CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y # CONFIG_DEFAULT_AS is not set @@ -155,7 +171,7 @@ CONFIG_CLASSIC_RCU=y # Processor type and features # CONFIG_TICK_ONESHOT=y -# CONFIG_NO_HZ is not set +CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y @@ -171,7 +187,8 @@ CONFIG_X86_PC=y # CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set # CONFIG_PARAVIRT_GUEST is not set -# CONFIG_MEMTEST_BOOTPARAM is not set +CONFIG_MEMTEST_BOOTPARAM=y +CONFIG_MEMTEST_BOOTPARAM_VALUE=0 # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -196,14 +213,17 @@ CONFIG_X86_PC=y # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -CONFIG_GENERIC_CPU=y +CONFIG_MCORE2=y +# CONFIG_GENERIC_CPU is not set CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_GOOD_APIC=y +CONFIG_X86_INTEL_USERCOPY=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=64 @@ -212,20 +232,19 @@ CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_DMI=y CONFIG_GART_IOMMU=y -# CONFIG_CALGARY_IOMMU is not set +CONFIG_CALGARY_IOMMU=y +CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y -CONFIG_NR_CPUS=32 -CONFIG_SCHED_SMT=y +CONFIG_NR_CPUS=4 +# CONFIG_SCHED_SMT is not set CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y -CONFIG_X86_MCE=y -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y +# CONFIG_X86_MCE is not set # CONFIG_I8K is not set # CONFIG_MICROCODE is not set CONFIG_X86_MSR=y @@ -234,7 +253,7 @@ CONFIG_NUMA=y CONFIG_K8_NUMA=y CONFIG_X86_64_ACPI_NUMA=y CONFIG_NODES_SPAN_OTHER_NODES=y -CONFIG_NUMA_EMU=y +# CONFIG_NUMA_EMU is not set CONFIG_NODES_SHIFT=6 CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_ARCH_SPARSEMEM_ENABLE=y @@ -249,7 +268,7 @@ CONFIG_HAVE_MEMORY_PRESENT=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPARSEMEM_EXTREME=y CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -# CONFIG_SPARSEMEM_VMEMMAP is not set +CONFIG_SPARSEMEM_VMEMMAP=y # # Memory hotplug is currently incompatible with Software Suspend @@ -263,18 +282,18 @@ CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_MTRR=y # CONFIG_X86_PAT is not set -# CONFIG_EFI is not set +CONFIG_EFI=y CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ_1000=y +CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y -# CONFIG_KEXEC is not set -# CONFIG_CRASH_DUMP is not set -CONFIG_PHYSICAL_START=0x200000 -# CONFIG_RELOCATABLE is not set +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set @@ -286,10 +305,15 @@ CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y # CONFIG_ARCH_HIBERNATION_HEADER=y CONFIG_PM=y -# CONFIG_PM_DEBUG is not set +CONFIG_PM_DEBUG=y +# CONFIG_PM_VERBOSE is not set +CONFIG_CAN_PM_TRACE=y +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y CONFIG_PM_SLEEP_SMP=y CONFIG_PM_SLEEP=y -# CONFIG_SUSPEND is not set +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" CONFIG_ACPI=y @@ -327,35 +351,34 @@ CONFIG_ACPI_CONTAINER=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=y CONFIG_CPU_FREQ_DEBUG=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_STAT_DETAILS is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_STAT is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y # CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set # CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set # # CPUFreq processor drivers # CONFIG_X86_ACPI_CPUFREQ=y -CONFIG_X86_POWERNOW_K8=y -CONFIG_X86_POWERNOW_K8_ACPI=y +# CONFIG_X86_POWERNOW_K8 is not set # CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=y +# CONFIG_X86_P4_CLOCKMOD is not set # # shared options # -CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y -CONFIG_X86_SPEEDSTEP_LIB=y +# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set +# CONFIG_X86_SPEEDSTEP_LIB is not set CONFIG_CPU_IDLE=y CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y # # Bus options (PCI etc.) @@ -364,28 +387,53 @@ CONFIG_PCI=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCI_DOMAINS=y -# CONFIG_DMAR is not set +CONFIG_DMAR=y +CONFIG_DMAR_GFX_WA=y +CONFIG_DMAR_FLOPPY_WA=y CONFIG_PCIEPORTBUS=y +# CONFIG_HOTPLUG_PCI_PCIE is not set CONFIG_PCIEAER=y # CONFIG_PCIEASPM is not set CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y -CONFIG_PCI_LEGACY=y +# CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set -# CONFIG_HT_IRQ is not set +CONFIG_HT_IRQ=y CONFIG_ISA_DMA_API=y CONFIG_K8_NB=y -# CONFIG_PCCARD is not set -# CONFIG_HOTPLUG_PCI is not set +CONFIG_PCCARD=y +# CONFIG_PCMCIA_DEBUG is not set +CONFIG_PCMCIA=y +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_PCMCIA_IOCTL=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=y +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +# CONFIG_PD6729 is not set +# CONFIG_I82092 is not set +CONFIG_PCCARD_NONSTATIC=y +CONFIG_HOTPLUG_PCI=y +# CONFIG_HOTPLUG_PCI_FAKE is not set +# CONFIG_HOTPLUG_PCI_ACPI is not set +# CONFIG_HOTPLUG_PCI_CPCI is not set +# CONFIG_HOTPLUG_PCI_SHPC is not set # # Executable file formats / Emulations # CONFIG_BINFMT_ELF=y CONFIG_COMPAT_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set +CONFIG_BINFMT_MISC=y CONFIG_IA32_EMULATION=y -CONFIG_IA32_AOUT=y +# CONFIG_IA32_AOUT is not set CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_SYSVIPC_COMPAT=y @@ -399,22 +447,31 @@ CONFIG_NET=y # Networking options # CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set +CONFIG_PACKET_MMAP=y CONFIG_UNIX=y +CONFIG_XFRM=y +CONFIG_XFRM_USER=y +# CONFIG_XFRM_SUB_POLICY is not set +# CONFIG_XFRM_MIGRATE is not set +# CONFIG_XFRM_STATISTICS is not set # CONFIG_NET_KEY is not set CONFIG_INET=y CONFIG_IP_MULTICAST=y -# CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_ASK_IP_FIB_HASH=y +# CONFIG_IP_FIB_TRIE is not set CONFIG_IP_FIB_HASH=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +# CONFIG_IP_PNP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set -# CONFIG_IP_MROUTE is not set +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y # CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set +CONFIG_SYN_COOKIES=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set @@ -423,34 +480,109 @@ CONFIG_INET_TUNNEL=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set -CONFIG_INET_DIAG=y -CONFIG_INET_TCP_DIAG=y -# CONFIG_TCP_CONG_ADVANCED is not set +CONFIG_INET_LRO=y +# CONFIG_INET_DIAG is not set +CONFIG_TCP_CONG_ADVANCED=y +# CONFIG_TCP_CONG_BIC is not set CONFIG_TCP_CONG_CUBIC=y +# CONFIG_TCP_CONG_WESTWOOD is not set +# CONFIG_TCP_CONG_HTCP is not set +# CONFIG_TCP_CONG_HSTCP is not set +# CONFIG_TCP_CONG_HYBLA is not set +# CONFIG_TCP_CONG_VEGAS is not set +# CONFIG_TCP_CONG_SCALABLE is not set +# CONFIG_TCP_CONG_LP is not set +# CONFIG_TCP_CONG_VENO is not set +# CONFIG_TCP_CONG_YEAH is not set +# CONFIG_TCP_CONG_ILLINOIS is not set +# CONFIG_DEFAULT_BIC is not set +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_HTCP is not set +# CONFIG_DEFAULT_VEGAS is not set +# CONFIG_DEFAULT_WESTWOOD is not set +# CONFIG_DEFAULT_RENO is not set CONFIG_DEFAULT_TCP_CONG="cubic" -# CONFIG_TCP_MD5SIG is not set +CONFIG_TCP_MD5SIG=y +# CONFIG_IP_VS is not set CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set # CONFIG_IPV6_OPTIMISTIC_DAD is not set -# CONFIG_INET6_AH is not set -# CONFIG_INET6_ESP is not set +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y # CONFIG_INET6_IPCOMP is not set # CONFIG_IPV6_MIP6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set -# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set -# CONFIG_INET6_XFRM_MODE_TUNNEL is not set -# CONFIG_INET6_XFRM_MODE_BEET is not set +CONFIG_INET6_XFRM_MODE_TRANSPORT=y +CONFIG_INET6_XFRM_MODE_TUNNEL=y +CONFIG_INET6_XFRM_MODE_BEET=y # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y CONFIG_IPV6_NDISC_NODETYPE=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_IPV6_MROUTE is not set -# CONFIG_NETWORK_SECMARK is not set -# CONFIG_NETFILTER is not set +CONFIG_NETLABEL=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +# CONFIG_NETFILTER_ADVANCED is not set + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_NETLINK=y +CONFIG_NETFILTER_NETLINK_LOG=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_SIP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NETFILTER_XTABLES=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_STATE=y + +# +# IP: Netfilter Configuration +# +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_PROC_COMPAT=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_TARGET_LOG=y +CONFIG_IP_NF_TARGET_ULOG=y +CONFIG_NF_NAT=y +CONFIG_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_NF_NAT_FTP=y +CONFIG_NF_NAT_IRC=y +# CONFIG_NF_NAT_TFTP is not set +# CONFIG_NF_NAT_AMANDA is not set +# CONFIG_NF_NAT_PPTP is not set +# CONFIG_NF_NAT_H323 is not set +CONFIG_NF_NAT_SIP=y +CONFIG_IP_NF_MANGLE=y + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MATCH_IPV6HEADER=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_TARGET_LOG=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IP6_NF_MANGLE=y # CONFIG_IP_DCCP is not set # CONFIG_IP_SCTP is not set # CONFIG_TIPC is not set @@ -458,6 +590,7 @@ CONFIG_IPV6_NDISC_NODETYPE=y # CONFIG_BRIDGE is not set # CONFIG_VLAN_8021Q is not set # CONFIG_DECNET is not set +CONFIG_LLC=y # CONFIG_LLC2 is not set # CONFIG_IPX is not set # CONFIG_ATALK is not set @@ -465,25 +598,99 @@ CONFIG_IPV6_NDISC_NODETYPE=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set -# CONFIG_NET_SCHED is not set +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +# CONFIG_NET_SCH_CBQ is not set +# CONFIG_NET_SCH_HTB is not set +# CONFIG_NET_SCH_HFSC is not set +# CONFIG_NET_SCH_PRIO is not set +# CONFIG_NET_SCH_RR is not set +# CONFIG_NET_SCH_RED is not set +# CONFIG_NET_SCH_SFQ is not set +# CONFIG_NET_SCH_TEQL is not set +# CONFIG_NET_SCH_TBF is not set +# CONFIG_NET_SCH_GRED is not set +# CONFIG_NET_SCH_DSMARK is not set +# CONFIG_NET_SCH_NETEM is not set +# CONFIG_NET_SCH_INGRESS is not set + +# +# Classification +# +CONFIG_NET_CLS=y +# CONFIG_NET_CLS_BASIC is not set +# CONFIG_NET_CLS_TCINDEX is not set +# CONFIG_NET_CLS_ROUTE4 is not set +# CONFIG_NET_CLS_FW is not set +# CONFIG_NET_CLS_U32 is not set +# CONFIG_NET_CLS_RSVP is not set +# CONFIG_NET_CLS_RSVP6 is not set +# CONFIG_NET_CLS_FLOW is not set +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +# CONFIG_NET_EMATCH_CMP is not set +# CONFIG_NET_EMATCH_NBYTE is not set +# CONFIG_NET_EMATCH_U32 is not set +# CONFIG_NET_EMATCH_META is not set +# CONFIG_NET_EMATCH_TEXT is not set +CONFIG_NET_CLS_ACT=y +# CONFIG_NET_ACT_POLICE is not set +# CONFIG_NET_ACT_GACT is not set +# CONFIG_NET_ACT_MIRRED is not set +# CONFIG_NET_ACT_IPT is not set +# CONFIG_NET_ACT_NAT is not set +# CONFIG_NET_ACT_PEDIT is not set +# CONFIG_NET_ACT_SIMP is not set +CONFIG_NET_SCH_FIFO=y # # Network testing # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set -# CONFIG_HAMRADIO is not set +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +# CONFIG_AX25 is not set # CONFIG_CAN is not set # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set +CONFIG_FIB_RULES=y # # Wireless # -# CONFIG_CFG80211 is not set -# CONFIG_WIRELESS_EXT is not set -# CONFIG_MAC80211 is not set +CONFIG_CFG80211=y +CONFIG_NL80211=y +CONFIG_WIRELESS_EXT=y +CONFIG_MAC80211=y + +# +# Rate control algorithm selection +# +CONFIG_MAC80211_RC_DEFAULT_PID=y +# CONFIG_MAC80211_RC_DEFAULT_NONE is not set + +# +# Selecting 'y' for an algorithm will +# + +# +# build the algorithm into mac80211. +# +CONFIG_MAC80211_RC_DEFAULT="pid" +CONFIG_MAC80211_RC_PID=y +# CONFIG_MAC80211_MESH is not set +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set +# CONFIG_MAC80211_DEBUG is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -500,9 +707,10 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set +CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set -# CONFIG_CONNECTOR is not set +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y # CONFIG_MTD is not set # CONFIG_PARPORT is not set CONFIG_PNP=y @@ -513,7 +721,7 @@ CONFIG_PNP=y # CONFIG_PNPACPI=y CONFIG_BLK_DEV=y -CONFIG_BLK_DEV_FD=y +# CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set # CONFIG_BLK_DEV_DAC960 is not set @@ -526,7 +734,7 @@ CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_UB is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -536,77 +744,16 @@ CONFIG_MISC_DEVICES=y # CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set +# CONFIG_ACER_WMI is not set +# CONFIG_ASUS_LAPTOP is not set +# CONFIG_FUJITSU_LAPTOP is not set +# CONFIG_MSI_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set # CONFIG_INTEL_MENLOW is not set # CONFIG_ENCLOSURE_SERVICES is not set CONFIG_HAVE_IDE=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDE=y - -# -# Please see Documentation/ide/ide.txt for help/info on IDE drives -# -# CONFIG_BLK_DEV_IDE_SATA is not set -CONFIG_BLK_DEV_IDEDISK=y -CONFIG_IDEDISK_MULTI_MODE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_IDECD_VERBOSE_ERRORS=y -# CONFIG_BLK_DEV_IDETAPE is not set -# CONFIG_BLK_DEV_IDEFLOPPY is not set -# CONFIG_BLK_DEV_IDESCSI is not set -CONFIG_BLK_DEV_IDEACPI=y -# CONFIG_IDE_TASK_IOCTL is not set -CONFIG_IDE_PROC_FS=y - -# -# IDE chipset support/bugfixes -# -CONFIG_IDE_GENERIC=y -# CONFIG_BLK_DEV_PLATFORM is not set -# CONFIG_BLK_DEV_CMD640 is not set -# CONFIG_BLK_DEV_IDEPNP is not set -CONFIG_BLK_DEV_IDEDMA_SFF=y - -# -# PCI IDE chipsets support -# -CONFIG_BLK_DEV_IDEPCI=y -CONFIG_IDEPCI_PCIBUS_ORDER=y -# CONFIG_BLK_DEV_OFFBOARD is not set -# CONFIG_BLK_DEV_GENERIC is not set -# CONFIG_BLK_DEV_OPTI621 is not set -# CONFIG_BLK_DEV_RZ1000 is not set -CONFIG_BLK_DEV_IDEDMA_PCI=y -# CONFIG_BLK_DEV_AEC62XX is not set -# CONFIG_BLK_DEV_ALI15X3 is not set -CONFIG_BLK_DEV_AMD74XX=y -CONFIG_BLK_DEV_ATIIXP=y -# CONFIG_BLK_DEV_CMD64X is not set -# CONFIG_BLK_DEV_TRIFLEX is not set -# CONFIG_BLK_DEV_CY82C693 is not set -# CONFIG_BLK_DEV_CS5520 is not set -# CONFIG_BLK_DEV_CS5530 is not set -# CONFIG_BLK_DEV_HPT34X is not set -# CONFIG_BLK_DEV_HPT366 is not set -# CONFIG_BLK_DEV_JMICRON is not set -# CONFIG_BLK_DEV_SC1200 is not set -CONFIG_BLK_DEV_PIIX=y -# CONFIG_BLK_DEV_IT8213 is not set -# CONFIG_BLK_DEV_IT821X is not set -# CONFIG_BLK_DEV_NS87415 is not set -# CONFIG_BLK_DEV_PDC202XX_OLD is not set -CONFIG_BLK_DEV_PDC202XX_NEW=y -# CONFIG_BLK_DEV_SVWKS is not set -# CONFIG_BLK_DEV_SIIMAGE is not set -# CONFIG_BLK_DEV_SIS5513 is not set -# CONFIG_BLK_DEV_SLC90E66 is not set -# CONFIG_BLK_DEV_TRM290 is not set -# CONFIG_BLK_DEV_VIA82CXXX is not set -# CONFIG_BLK_DEV_TC86C001 is not set -CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_BLK_DEV_HD_ONLY is not set -# CONFIG_BLK_DEV_HD is not set +# CONFIG_IDE is not set # # SCSI device support @@ -615,8 +762,8 @@ CONFIG_BLK_DEV_IDEDMA=y CONFIG_SCSI=y CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set -CONFIG_SCSI_NETLINK=y -# CONFIG_SCSI_PROC_FS is not set +# CONFIG_SCSI_NETLINK is not set +CONFIG_SCSI_PROC_FS=y # # SCSI support type (disk, tape, CD-ROM) @@ -625,7 +772,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set CONFIG_BLK_DEV_SR=y -# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_BLK_DEV_SR_VENDOR=y CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set @@ -642,76 +789,110 @@ CONFIG_SCSI_WAIT_SCAN=m # SCSI Transports # CONFIG_SCSI_SPI_ATTRS=y -CONFIG_SCSI_FC_ATTRS=y +# CONFIG_SCSI_FC_ATTRS is not set # CONFIG_SCSI_ISCSI_ATTRS is not set +# CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set # CONFIG_SCSI_SRP_ATTRS is not set # CONFIG_SCSI_LOWLEVEL is not set +# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y -# CONFIG_SATA_PMP is not set +CONFIG_SATA_PMP=y CONFIG_SATA_AHCI=y # CONFIG_SATA_SIL24 is not set -# CONFIG_ATA_SFF is not set +CONFIG_ATA_SFF=y +# CONFIG_SATA_SVW is not set +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +# CONFIG_SATA_NV is not set +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +# CONFIG_SATA_SIL is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +# CONFIG_SATA_VIA is not set +# CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ACPI is not set +# CONFIG_PATA_ALI is not set +CONFIG_PATA_AMD=y +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set +# CONFIG_PATA_MPIIX is not set +CONFIG_PATA_OLDPIIX=y +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NINJA32 is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_NS87415 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PCMCIA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set CONFIG_MD=y -# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_MD=y +# CONFIG_MD_LINEAR is not set +# CONFIG_MD_RAID0 is not set +# CONFIG_MD_RAID1 is not set +# CONFIG_MD_RAID10 is not set +# CONFIG_MD_RAID456 is not set +# CONFIG_MD_MULTIPATH is not set +# CONFIG_MD_FAULTY is not set CONFIG_BLK_DEV_DM=y # CONFIG_DM_DEBUG is not set # CONFIG_DM_CRYPT is not set # CONFIG_DM_SNAPSHOT is not set -# CONFIG_DM_MIRROR is not set -# CONFIG_DM_ZERO is not set +CONFIG_DM_MIRROR=y +CONFIG_DM_ZERO=y # CONFIG_DM_MULTIPATH is not set # CONFIG_DM_DELAY is not set # CONFIG_DM_UEVENT is not set -CONFIG_FUSION=y -CONFIG_FUSION_SPI=y -# CONFIG_FUSION_FC is not set -# CONFIG_FUSION_SAS is not set -CONFIG_FUSION_MAX_SGE=128 -# CONFIG_FUSION_CTL is not set -# CONFIG_FUSION_LOGGING is not set +# CONFIG_FUSION is not set # # IEEE 1394 (FireWire) support # # CONFIG_FIREWIRE is not set -CONFIG_IEEE1394=y - -# -# Subsystem Options -# -# CONFIG_IEEE1394_VERBOSEDEBUG is not set - -# -# Controllers -# - -# -# Texas Instruments PCILynx requires I2C -# -CONFIG_IEEE1394_OHCI1394=y - -# -# Protocols -# -# CONFIG_IEEE1394_VIDEO1394 is not set -# CONFIG_IEEE1394_SBP2 is not set -# CONFIG_IEEE1394_ETH1394_ROM_ENTRY is not set -# CONFIG_IEEE1394_ETH1394 is not set -# CONFIG_IEEE1394_DV1394 is not set -CONFIG_IEEE1394_RAWIO=y +# CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y -# CONFIG_MAC_EMUMOUSEBTN is not set +CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -CONFIG_NETDEVICES_MULTIQUEUE=y +# CONFIG_NETDEVICES_MULTIQUEUE is not set +# CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set -CONFIG_TUN=y +# CONFIG_TUN is not set # CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set @@ -722,18 +903,16 @@ CONFIG_MII=y # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set CONFIG_NET_VENDOR_3COM=y -CONFIG_VORTEX=y +# CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set -CONFIG_TULIP=y -# CONFIG_TULIP_MWI is not set -# CONFIG_TULIP_MMIO is not set -# CONFIG_TULIP_NAPI is not set +# CONFIG_TULIP is not set # CONFIG_DE4X5 is not set # CONFIG_WINBOND_840 is not set # CONFIG_DM9102 is not set # CONFIG_ULI526X is not set +# CONFIG_PCMCIA_XIRCOM is not set # CONFIG_HP100 is not set # CONFIG_IBM_NEW_EMAC_ZMII is not set # CONFIG_IBM_NEW_EMAC_RGMII is not set @@ -741,13 +920,9 @@ CONFIG_TULIP=y # CONFIG_IBM_NEW_EMAC_EMAC4 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -CONFIG_AMD8111_ETH=y -# CONFIG_AMD8111E_NAPI is not set +# CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set -CONFIG_B44=y -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y +# CONFIG_B44 is not set CONFIG_FORCEDETH=y # CONFIG_FORCEDETH_NAPI is not set # CONFIG_EEPRO100 is not set @@ -755,9 +930,9 @@ CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set -CONFIG_8139CP=y +# CONFIG_8139CP is not set CONFIG_8139TOO=y -# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_PIO=y # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set # CONFIG_8139_OLD_RX_RESET is not set @@ -783,10 +958,11 @@ CONFIG_E1000=y # CONFIG_R8169 is not set # CONFIG_SIS190 is not set # CONFIG_SKGE is not set -# CONFIG_SKY2 is not set +CONFIG_SKY2=y +# CONFIG_SKY2_DEBUG is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y -CONFIG_BNX2=y +# CONFIG_BNX2 is not set # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set CONFIG_NETDEV_10000=y @@ -794,8 +970,7 @@ CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T3 is not set # CONFIG_IXGBE is not set # CONFIG_IXGB is not set -CONFIG_S2IO=m -# CONFIG_S2IO_NAPI is not set +# CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -803,15 +978,44 @@ CONFIG_S2IO=m # CONFIG_TEHUTI is not set # CONFIG_BNX2X is not set # CONFIG_SFC is not set -# CONFIG_TR is not set +CONFIG_TR=y +# CONFIG_IBMOL is not set +# CONFIG_3C359 is not set +# CONFIG_TMS380TR is not set # # Wireless LAN # # CONFIG_WLAN_PRE80211 is not set -# CONFIG_WLAN_80211 is not set +CONFIG_WLAN_80211=y +# CONFIG_PCMCIA_RAYCS is not set +# CONFIG_IPW2100 is not set +# CONFIG_IPW2200 is not set +# CONFIG_LIBERTAS is not set +# CONFIG_AIRO is not set +# CONFIG_HERMES is not set +# CONFIG_ATMEL is not set +# CONFIG_AIRO_CS is not set +# CONFIG_PCMCIA_WL3501 is not set +# CONFIG_PRISM54 is not set +# CONFIG_USB_ZD1201 is not set +# CONFIG_USB_NET_RNDIS_WLAN is not set +# CONFIG_RTL8180 is not set +# CONFIG_RTL8187 is not set +# CONFIG_ADM8211 is not set +# CONFIG_P54_COMMON is not set +CONFIG_ATH5K=y +# CONFIG_ATH5K_DEBUG is not set # CONFIG_IWLWIFI is not set +# CONFIG_IWLCORE is not set # CONFIG_IWLWIFI_LEDS is not set +# CONFIG_IWL4965 is not set +# CONFIG_IWL3945 is not set +# CONFIG_HOSTAP is not set +# CONFIG_B43 is not set +# CONFIG_B43LEGACY is not set +# CONFIG_ZD1211RW is not set +# CONFIG_RT2X00 is not set # # USB Network Adapters @@ -821,8 +1025,19 @@ CONFIG_S2IO=m # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set # CONFIG_USB_USBNET is not set +CONFIG_NET_PCMCIA=y +# CONFIG_PCMCIA_3C589 is not set +# CONFIG_PCMCIA_3C574 is not set +# CONFIG_PCMCIA_FMVJ18X is not set +# CONFIG_PCMCIA_PCNET is not set +# CONFIG_PCMCIA_NMCLAN is not set +# CONFIG_PCMCIA_SMC91C92 is not set +# CONFIG_PCMCIA_XIRC2PS is not set +# CONFIG_PCMCIA_AXNET is not set # CONFIG_WAN is not set -# CONFIG_FDDI is not set +CONFIG_FDDI=y +# CONFIG_DEFXX is not set +# CONFIG_SKFP is not set # CONFIG_HIPPI is not set # CONFIG_PPP is not set # CONFIG_SLIP is not set @@ -839,14 +1054,14 @@ CONFIG_NET_POLL_CONTROLLER=y # Input device support # CONFIG_INPUT=y -# CONFIG_INPUT_FF_MEMLESS is not set -# CONFIG_INPUT_POLLDEV is not set +CONFIG_INPUT_FF_MEMLESS=y +CONFIG_INPUT_POLLDEV=y # # Userland interfaces # CONFIG_INPUT_MOUSEDEV=y -CONFIG_INPUT_MOUSEDEV_PSAUX=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_JOYDEV is not set @@ -874,17 +1089,62 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set # CONFIG_MOUSE_VSXXXAA is not set -# CONFIG_INPUT_JOYSTICK is not set -# CONFIG_INPUT_TABLET is not set -# CONFIG_INPUT_TOUCHSCREEN is not set -# CONFIG_INPUT_MISC is not set +CONFIG_INPUT_JOYSTICK=y +# CONFIG_JOYSTICK_ANALOG is not set +# CONFIG_JOYSTICK_A3D is not set +# CONFIG_JOYSTICK_ADI is not set +# CONFIG_JOYSTICK_COBRA is not set +# CONFIG_JOYSTICK_GF2K is not set +# CONFIG_JOYSTICK_GRIP is not set +# CONFIG_JOYSTICK_GRIP_MP is not set +# CONFIG_JOYSTICK_GUILLEMOT is not set +# CONFIG_JOYSTICK_INTERACT is not set +# CONFIG_JOYSTICK_SIDEWINDER is not set +# CONFIG_JOYSTICK_TMDC is not set +# CONFIG_JOYSTICK_IFORCE is not set +# CONFIG_JOYSTICK_WARRIOR is not set +# CONFIG_JOYSTICK_MAGELLAN is not set +# CONFIG_JOYSTICK_SPACEORB is not set +# CONFIG_JOYSTICK_SPACEBALL is not set +# CONFIG_JOYSTICK_STINGER is not set +# CONFIG_JOYSTICK_TWIDJOY is not set +# CONFIG_JOYSTICK_ZHENHUA is not set +# CONFIG_JOYSTICK_JOYDUMP is not set +# CONFIG_JOYSTICK_XPAD is not set +CONFIG_INPUT_TABLET=y +# CONFIG_TABLET_USB_ACECAD is not set +# CONFIG_TABLET_USB_AIPTEK is not set +# CONFIG_TABLET_USB_GTCO is not set +# CONFIG_TABLET_USB_KBTAB is not set +# CONFIG_TABLET_USB_WACOM is not set +CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_FUJITSU is not set +# CONFIG_TOUCHSCREEN_GUNZE is not set +# CONFIG_TOUCHSCREEN_ELO is not set +# CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_MK712 is not set +# CONFIG_TOUCHSCREEN_PENMOUNT is not set +# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set +# CONFIG_TOUCHSCREEN_TOUCHWIN is not set +# CONFIG_TOUCHSCREEN_UCB1400 is not set +# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +CONFIG_INPUT_MISC=y +# CONFIG_INPUT_PCSPKR is not set +# CONFIG_INPUT_APANEL is not set +# CONFIG_INPUT_ATLAS_BTNS is not set +# CONFIG_INPUT_ATI_REMOTE is not set +# CONFIG_INPUT_ATI_REMOTE2 is not set +# CONFIG_INPUT_KEYSPAN_REMOTE is not set +# CONFIG_INPUT_POWERMATE is not set +# CONFIG_INPUT_YEALINK is not set +# CONFIG_INPUT_UINPUT is not set # # Hardware I/O ports # CONFIG_SERIO=y CONFIG_SERIO_I8042=y -# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_SERPORT=y # CONFIG_SERIO_CT82C710 is not set # CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y @@ -897,9 +1157,25 @@ CONFIG_SERIO_LIBPS2=y CONFIG_VT=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y -# CONFIG_VT_HW_CONSOLE_BINDING is not set -# CONFIG_DEVKMEM is not set -# CONFIG_SERIAL_NONSTANDARD is not set +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_DEVKMEM=y +CONFIG_SERIAL_NONSTANDARD=y +# CONFIG_COMPUTONE is not set +# CONFIG_ROCKETPORT is not set +# CONFIG_CYCLADES is not set +# CONFIG_DIGIEPCA is not set +# CONFIG_MOXA_INTELLIO is not set +# CONFIG_MOXA_SMARTIO is not set +# CONFIG_ISI is not set +# CONFIG_SYNCLINK is not set +# CONFIG_SYNCLINKMP is not set +# CONFIG_SYNCLINK_GT is not set +# CONFIG_N_HDLC is not set +# CONFIG_RISCOM8 is not set +# CONFIG_SPECIALIX is not set +# CONFIG_SX is not set +# CONFIG_RIO is not set +# CONFIG_STALDRV is not set # CONFIG_NOZOMI is not set # @@ -910,9 +1186,14 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y -CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_CS is not set +CONFIG_SERIAL_8250_NR_UARTS=32 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -# CONFIG_SERIAL_8250_EXTENDED is not set +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_DETECT_IRQ=y +CONFIG_SERIAL_8250_RSA=y # # Non-8250 serial port support @@ -921,28 +1202,78 @@ CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y -CONFIG_HW_RANDOM_INTEL=y -CONFIG_HW_RANDOM_AMD=y -# CONFIG_NVRAM is not set -CONFIG_RTC=y +# CONFIG_HW_RANDOM_INTEL is not set +# CONFIG_HW_RANDOM_AMD is not set +CONFIG_NVRAM=y # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set + +# +# PCMCIA character devices +# +# CONFIG_SYNCLINK_CS is not set +# CONFIG_CARDMAN_4000 is not set +# CONFIG_CARDMAN_4040 is not set +# CONFIG_IPWIRELESS is not set # CONFIG_MWAVE is not set # CONFIG_PC8736x_GPIO is not set -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=256 +# CONFIG_RAW_DRIVER is not set CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set -CONFIG_HPET_MMAP=y +# CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y -# CONFIG_I2C is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +# CONFIG_I2C_CHARDEV is not set + +# +# I2C Hardware Bus support +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +CONFIG_I2C_I801=y +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PROSAVAGE is not set +# CONFIG_I2C_SAVAGE4 is not set +# CONFIG_I2C_SIMTEC is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_TINY_USB is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +# CONFIG_I2C_PCA_PLATFORM is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_EEPROM is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set # CONFIG_W1 is not set CONFIG_POWER_SUPPLY=y @@ -951,20 +1282,55 @@ CONFIG_POWER_SUPPLY=y # CONFIG_BATTERY_DS2760 is not set # CONFIG_HWMON is not set CONFIG_THERMAL=y -# CONFIG_WATCHDOG is not set +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set + +# +# Watchdog Device Drivers +# +# CONFIG_SOFT_WATCHDOG is not set +# CONFIG_ACQUIRE_WDT is not set +# CONFIG_ADVANTECH_WDT is not set +# CONFIG_ALIM1535_WDT is not set +# CONFIG_ALIM7101_WDT is not set +# CONFIG_SC520_WDT is not set +# CONFIG_EUROTECH_WDT is not set +# CONFIG_IB700_WDT is not set +# CONFIG_IBMASR is not set +# CONFIG_WAFER_WDT is not set +# CONFIG_I6300ESB_WDT is not set +# CONFIG_ITCO_WDT is not set +# CONFIG_IT8712F_WDT is not set +# CONFIG_HP_WATCHDOG is not set +# CONFIG_SC1200_WDT is not set +# CONFIG_PC87413_WDT is not set +# CONFIG_60XX_WDT is not set +# CONFIG_SBC8360_WDT is not set +# CONFIG_CPU5_WDT is not set +# CONFIG_SMSC37B787_WDT is not set +# CONFIG_W83627HF_WDT is not set +# CONFIG_W83697HF_WDT is not set +# CONFIG_W83877F_WDT is not set +# CONFIG_W83977F_WDT is not set +# CONFIG_MACHZ_WDT is not set +# CONFIG_SBC_EPX_C3_WATCHDOG is not set + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set # # Sonics Silicon Backplane # CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=y -CONFIG_SSB_SPROM=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -# CONFIG_SSB_B43_PCI_BRIDGE is not set -# CONFIG_SSB_DEBUG is not set -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y +# CONFIG_SSB is not set # # Multifunction device drivers @@ -996,11 +1362,81 @@ CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y # CONFIG_AGP_SIS is not set # CONFIG_AGP_VIA is not set -# CONFIG_DRM is not set +CONFIG_DRM=y +# CONFIG_DRM_TDFX is not set +# CONFIG_DRM_R128 is not set +# CONFIG_DRM_RADEON is not set +# CONFIG_DRM_I810 is not set +# CONFIG_DRM_I830 is not set +CONFIG_DRM_I915=y +# CONFIG_DRM_MGA is not set +# CONFIG_DRM_SIS is not set +# CONFIG_DRM_VIA is not set +# CONFIG_DRM_SAVAGE is not set # CONFIG_VGASTATE is not set # CONFIG_VIDEO_OUTPUT_CONTROL is not set -# CONFIG_FB is not set -# CONFIG_BACKLIGHT_LCD_SUPPORT is not set +CONFIG_FB=y +# CONFIG_FIRMWARE_EDID is not set +# CONFIG_FB_DDC is not set +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set +# CONFIG_FB_SYS_FILLRECT is not set +# CONFIG_FB_SYS_COPYAREA is not set +# CONFIG_FB_SYS_IMAGEBLIT is not set +# CONFIG_FB_FOREIGN_ENDIAN is not set +# CONFIG_FB_SYS_FOPS is not set +CONFIG_FB_DEFERRED_IO=y +# CONFIG_FB_SVGALIB is not set +# CONFIG_FB_MACMODES is not set +# CONFIG_FB_BACKLIGHT is not set +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +# CONFIG_FB_VESA is not set +CONFIG_FB_EFI=y +# CONFIG_FB_IMAC is not set +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_GEODE is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +# CONFIG_LCD_CLASS_DEVICE is not set +CONFIG_BACKLIGHT_CLASS_DEVICE=y +# CONFIG_BACKLIGHT_CORGI is not set +# CONFIG_BACKLIGHT_PROGEAR is not set # # Display device support @@ -1012,9 +1448,14 @@ CONFIG_AGP_INTEL=y # CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y -CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_LOGO_LINUX_CLUT224=y # # Sound @@ -1024,35 +1465,165 @@ CONFIG_SOUND=y # # Advanced Linux Sound Architecture # -# CONFIG_SND is not set +CONFIG_SND=y +CONFIG_SND_TIMER=y +CONFIG_SND_PCM=y +CONFIG_SND_HWDEP=y +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQ_DUMMY=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=y +CONFIG_SND_PCM_OSS=y +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_SUPPORT_OLD_API=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y + +# +# Generic devices +# +# CONFIG_SND_PCSP is not set +# CONFIG_SND_DUMMY is not set +# CONFIG_SND_VIRMIDI is not set +# CONFIG_SND_MTPAV is not set +# CONFIG_SND_SERIAL_U16550 is not set +# CONFIG_SND_MPU401 is not set + +# +# PCI devices +# +# CONFIG_SND_AD1889 is not set +# CONFIG_SND_ALS300 is not set +# CONFIG_SND_ALS4000 is not set +# CONFIG_SND_ALI5451 is not set +# CONFIG_SND_ATIIXP is not set +# CONFIG_SND_ATIIXP_MODEM is not set +# CONFIG_SND_AU8810 is not set +# CONFIG_SND_AU8820 is not set +# CONFIG_SND_AU8830 is not set +# CONFIG_SND_AW2 is not set +# CONFIG_SND_AZT3328 is not set +# CONFIG_SND_BT87X is not set +# CONFIG_SND_CA0106 is not set +# CONFIG_SND_CMIPCI is not set +# CONFIG_SND_OXYGEN is not set +# CONFIG_SND_CS4281 is not set +# CONFIG_SND_CS46XX is not set +# CONFIG_SND_CS5530 is not set +# CONFIG_SND_DARLA20 is not set +# CONFIG_SND_GINA20 is not set +# CONFIG_SND_LAYLA20 is not set +# CONFIG_SND_DARLA24 is not set +# CONFIG_SND_GINA24 is not set +# CONFIG_SND_LAYLA24 is not set +# CONFIG_SND_MONA is not set +# CONFIG_SND_MIA is not set +# CONFIG_SND_ECHO3G is not set +# CONFIG_SND_INDIGO is not set +# CONFIG_SND_INDIGOIO is not set +# CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_EMU10K1 is not set +# CONFIG_SND_EMU10K1X is not set +# CONFIG_SND_ENS1370 is not set +# CONFIG_SND_ENS1371 is not set +# CONFIG_SND_ES1938 is not set +# CONFIG_SND_ES1968 is not set +# CONFIG_SND_FM801 is not set +CONFIG_SND_HDA_INTEL=y +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_CODEC_REALTEK=y +CONFIG_SND_HDA_CODEC_ANALOG=y +CONFIG_SND_HDA_CODEC_SIGMATEL=y +CONFIG_SND_HDA_CODEC_VIA=y +CONFIG_SND_HDA_CODEC_ATIHDMI=y +CONFIG_SND_HDA_CODEC_CONEXANT=y +CONFIG_SND_HDA_CODEC_CMEDIA=y +CONFIG_SND_HDA_CODEC_SI3054=y +CONFIG_SND_HDA_GENERIC=y +# CONFIG_SND_HDA_POWER_SAVE is not set +# CONFIG_SND_HDSP is not set +# CONFIG_SND_HDSPM is not set +# CONFIG_SND_HIFIER is not set +# CONFIG_SND_ICE1712 is not set +# CONFIG_SND_ICE1724 is not set +# CONFIG_SND_INTEL8X0 is not set +# CONFIG_SND_INTEL8X0M is not set +# CONFIG_SND_KORG1212 is not set +# CONFIG_SND_MAESTRO3 is not set +# CONFIG_SND_MIXART is not set +# CONFIG_SND_NM256 is not set +# CONFIG_SND_PCXHR is not set +# CONFIG_SND_RIPTIDE is not set +# CONFIG_SND_RME32 is not set +# CONFIG_SND_RME96 is not set +# CONFIG_SND_RME9652 is not set +# CONFIG_SND_SONICVIBES is not set +# CONFIG_SND_TRIDENT is not set +# CONFIG_SND_VIA82XX is not set +# CONFIG_SND_VIA82XX_MODEM is not set +# CONFIG_SND_VIRTUOSO is not set +# CONFIG_SND_VX222 is not set +# CONFIG_SND_YMFPCI is not set + +# +# USB devices +# +# CONFIG_SND_USB_AUDIO is not set +# CONFIG_SND_USB_USX2Y is not set +# CONFIG_SND_USB_CAIAQ is not set + +# +# PCMCIA devices +# +# CONFIG_SND_VXPOCKET is not set +# CONFIG_SND_PDAUDIOCF is not set + +# +# System on Chip audio support +# +# CONFIG_SND_SOC is not set + +# +# ALSA SoC audio for Freescale SOCs +# + +# +# SoC Audio for the Texas Instruments OMAP +# # # Open Sound System # -CONFIG_SOUND_PRIME=y -# CONFIG_SOUND_TRIDENT is not set -# CONFIG_SOUND_MSNDCLAS is not set -# CONFIG_SOUND_MSNDPIN is not set -# CONFIG_SOUND_OSS is not set +# CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y -# CONFIG_HID_DEBUG is not set +CONFIG_HID_DEBUG=y CONFIG_HIDRAW=y # # USB Input Devices # CONFIG_USB_HID=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set -# CONFIG_HID_FF is not set -# CONFIG_USB_HIDDEV is not set +CONFIG_USB_HIDINPUT_POWERBOOK=y +CONFIG_HID_FF=y +CONFIG_HID_PID=y +CONFIG_LOGITECH_FF=y +# CONFIG_LOGIRUMBLEPAD2_FF is not set +CONFIG_PANTHERLORD_FF=y +CONFIG_THRUSTMASTER_FF=y +CONFIG_ZEROPLUS_FF=y +CONFIG_USB_HIDDEV=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y CONFIG_USB=y -# CONFIG_USB_DEBUG is not set -# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set +CONFIG_USB_DEBUG=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # # Miscellaneous USB options @@ -1060,7 +1631,7 @@ CONFIG_USB=y CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set -# CONFIG_USB_SUSPEND is not set +CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set # @@ -1073,7 +1644,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_ISP116X_HCD is not set # CONFIG_USB_ISP1760_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_HCD_SSB is not set # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set # CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y @@ -1108,7 +1678,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set -# CONFIG_USB_LIBUSUAL is not set +CONFIG_USB_LIBUSUAL=y # # USB Imaging devices @@ -1148,11 +1718,78 @@ CONFIG_USB_MON=y # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set -# CONFIG_NEW_LEDS is not set +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y + +# +# LED drivers +# +# CONFIG_LEDS_CLEVO_MAIL is not set + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +# CONFIG_LEDS_TRIGGER_TIMER is not set +# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set +# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set -# CONFIG_EDAC is not set -# CONFIG_RTC_CLASS is not set +CONFIG_EDAC=y + +# +# Reporting subsystems +# +# CONFIG_EDAC_DEBUG is not set +# CONFIG_EDAC_MM_EDAC is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +# CONFIG_RTC_HCTOSYS is not set +# CONFIG_RTC_DEBUG is not set + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +# CONFIG_RTC_DRV_DS1307 is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_S35390A is not set + +# +# SPI RTC drivers +# + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_V3020 is not set + +# +# on-CPU RTC drivers +# CONFIG_DMADEVICES=y # @@ -1165,6 +1802,7 @@ CONFIG_DMADEVICES=y # Firmware Drivers # # CONFIG_EDD is not set +CONFIG_EFI_VARS=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y @@ -1173,25 +1811,16 @@ CONFIG_DMIID=y # # File systems # -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -# CONFIG_EXT2_FS_SECURITY is not set -# CONFIG_EXT2_FS_XIP is not set +# CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y -# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_EXT3_FS_SECURITY=y # CONFIG_EXT4DEV_FS is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=y -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -# CONFIG_REISERFS_FS_SECURITY is not set +# CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set @@ -1200,7 +1829,12 @@ CONFIG_FS_POSIX_ACL=y CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y -# CONFIG_QUOTA is not set +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QFMT_V1 is not set +CONFIG_QFMT_V2=y +CONFIG_QUOTACTL=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set @@ -1211,7 +1845,7 @@ CONFIG_GENERIC_ACL=y # CONFIG_ISO9660_FS=y CONFIG_JOLIET=y -# CONFIG_ZISOFS is not set +CONFIG_ZISOFS=y # CONFIG_UDF_FS is not set # @@ -1229,6 +1863,7 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" # CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y @@ -1242,6 +1877,7 @@ CONFIG_HUGETLB_PAGE=y # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +# CONFIG_ECRYPT_FS is not set # CONFIG_HFS_FS is not set # CONFIG_HFSPLUS_FS is not set # CONFIG_BEFS_FS is not set @@ -1255,15 +1891,38 @@ CONFIG_HUGETLB_PAGE=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set -# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NETWORK_FILESYSTEMS=y +# CONFIG_NFS_FS is not set +# CONFIG_NFSD is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set # # Partition Types # -# CONFIG_PARTITION_ADVANCED is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set CONFIG_NLS=y -CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=y # CONFIG_NLS_CODEPAGE_737 is not set # CONFIG_NLS_CODEPAGE_775 is not set @@ -1298,7 +1957,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_ISO8859_9 is not set # CONFIG_NLS_ISO8859_13 is not set # CONFIG_NLS_ISO8859_14 is not set -CONFIG_NLS_ISO8859_15=y +# CONFIG_NLS_ISO8859_15 is not set # CONFIG_NLS_KOI8_R is not set # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y @@ -1309,21 +1968,22 @@ CONFIG_NLS_UTF8=y # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set -CONFIG_ENABLE_WARN_DEPRECATED=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y -CONFIG_UNUSED_SYMBOLS=y +# CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set -CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_DETECT_SOFTLOCKUP is not set # CONFIG_SCHED_DEBUG is not set -# CONFIG_SCHEDSTATS is not set +CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y # CONFIG_DEBUG_OBJECTS is not set -# CONFIG_DEBUG_SLAB is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set @@ -1340,7 +2000,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_WRITECOUNT is not set # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set -# CONFIG_FRAME_POINTER is not set +CONFIG_FRAME_POINTER=y # CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_KPROBES_SANITY_TEST is not set @@ -1348,20 +2008,21 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set # CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_NONPROMISC_DEVMEM is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set -# CONFIG_DEBUG_RODATA is not set +CONFIG_DEBUG_RODATA=y # CONFIG_DIRECT_GBPAGES is not set -# CONFIG_DEBUG_NX_TEST is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_DEBUG_NX_TEST=m CONFIG_X86_MPPARSE=y # CONFIG_IOMMU_DEBUG is not set CONFIG_IO_DELAY_TYPE_0X80=0 @@ -1373,18 +2034,115 @@ CONFIG_IO_DELAY_0X80=y # CONFIG_IO_DELAY_UDELAY is not set # CONFIG_IO_DELAY_NONE is not set CONFIG_DEFAULT_IO_DELAY_TYPE=0 -# CONFIG_DEBUG_BOOT_PARAMS is not set +CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set # # Security options # -# CONFIG_KEYS is not set -# CONFIG_SECURITY is not set -# CONFIG_SECURITY_FILE_CAPABILITIES is not set -# CONFIG_CRYPTO is not set -# CONFIG_HAVE_KVM is not set -# CONFIG_VIRTUALIZATION is not set +CONFIG_KEYS=y +CONFIG_KEYS_DEBUG_PROC_KEYS=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +# CONFIG_SECURITY_NETWORK_XFRM is not set +CONFIG_SECURITY_CAPABILITIES=y +CONFIG_SECURITY_FILE_CAPABILITIES=y +# CONFIG_SECURITY_ROOTPLUG is not set +CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 +CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 +# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set +# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set +# CONFIG_SECURITY_SMACK is not set +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_MANAGER=y +# CONFIG_CRYPTO_GF128MUL is not set +# CONFIG_CRYPTO_NULL is not set +# CONFIG_CRYPTO_CRYPTD is not set +CONFIG_CRYPTO_AUTHENC=y +# CONFIG_CRYPTO_TEST is not set + +# +# Authenticated Encryption with Associated Data +# +# CONFIG_CRYPTO_CCM is not set +# CONFIG_CRYPTO_GCM is not set +# CONFIG_CRYPTO_SEQIV is not set + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +# CONFIG_CRYPTO_CTR is not set +# CONFIG_CRYPTO_CTS is not set +CONFIG_CRYPTO_ECB=y +# CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_PCBC is not set +# CONFIG_CRYPTO_XTS is not set + +# +# Hash modes +# +CONFIG_CRYPTO_HMAC=y +# CONFIG_CRYPTO_XCBC is not set + +# +# Digest +# +# CONFIG_CRYPTO_CRC32C is not set +# CONFIG_CRYPTO_MD4 is not set +CONFIG_CRYPTO_MD5=y +# CONFIG_CRYPTO_MICHAEL_MIC is not set +CONFIG_CRYPTO_SHA1=y +# CONFIG_CRYPTO_SHA256 is not set +# CONFIG_CRYPTO_SHA512 is not set +# CONFIG_CRYPTO_TGR192 is not set +# CONFIG_CRYPTO_WP512 is not set + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +# CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_ANUBIS is not set +CONFIG_CRYPTO_ARC4=y +# CONFIG_CRYPTO_BLOWFISH is not set +# CONFIG_CRYPTO_CAMELLIA is not set +# CONFIG_CRYPTO_CAST5 is not set +# CONFIG_CRYPTO_CAST6 is not set +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_FCRYPT is not set +# CONFIG_CRYPTO_KHAZAD is not set +# CONFIG_CRYPTO_SALSA20 is not set +# CONFIG_CRYPTO_SALSA20_X86_64 is not set +# CONFIG_CRYPTO_SEED is not set +# CONFIG_CRYPTO_SERPENT is not set +# CONFIG_CRYPTO_TEA is not set +# CONFIG_CRYPTO_TWOFISH is not set +# CONFIG_CRYPTO_TWOFISH_X86_64 is not set + +# +# Compression +# +# CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_LZO is not set +CONFIG_CRYPTO_HW=y +# CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_HAVE_KVM=y +CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set -- cgit v1.1 From 205f93288093df69f9ab5f6981aef27b91088b28 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@codemonkey.org.uk> Date: Mon, 5 May 2008 17:52:52 -0400 Subject: x86: add new cache descriptor The latest rev of Intel doc AP-485 details a new cache descriptor that we don't yet support. A 6MB 24-way assoc L2 cache. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 26d615d..2c8afaf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata = { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ -- cgit v1.1 From 0327318445d55808991a63137cfb698a90ab6adf Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Fri, 18 Apr 2008 17:49:15 -0700 Subject: x86_64: simplify the memtest parameter setting use CONFIG_MEMTEST only. if it is set, will have memtest=0 (disabled) need to have memtest=4 in command line to test more patterns. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 30 +++++++----------------------- arch/x86/mm/init_64.c | 5 +++-- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae..a1a9021 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -419,35 +419,19 @@ config PARAVIRT endif -config MEMTEST_BOOTPARAM - bool "Memtest boot parameter" +config MEMTEST + bool "Memtest" depends on X86_64 default y help This option adds a kernel parameter 'memtest', which allows memtest - to be disabled at boot. If this option is selected, memtest - functionality can be disabled with memtest=0 on the kernel - command line. The purpose of this option is to allow a single - kernel image to be distributed with memtest built in, but not - necessarily enabled. - + to be set. + memtest=0, mean disabled; -- default + memtest=1, mean do 1 test pattern; + ... + memtest=4, mean do 4 test patterns. If you are unsure how to answer this question, answer Y. -config MEMTEST_BOOTPARAM_VALUE - int "Memtest boot parameter default value (0-4)" - depends on MEMTEST_BOOTPARAM - range 0 4 - default 0 - help - This option sets the default value for the kernel parameter - 'memtest', which allows memtest to be disabled at boot. If this - option is set to 0 (zero), the memtest kernel parameter will - default to 0, disabling memtest at bootup. If this option is - set to 4, the memtest kernel parameter will default to 4, - enabling memtest at bootup, and use that as pattern number. - - If you are unsure how to answer this question, answer 0. - config ACPI_SRAT def_bool y depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b..c6d8131 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -431,7 +431,7 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -#ifdef CONFIG_MEMTEST_BOOTPARAM +#ifdef CONFIG_MEMTEST static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) @@ -493,7 +493,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, } -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; +/* default is disabled */ +static int memtest_pattern __initdata; static int __init parse_memtest(char *arg) { -- cgit v1.1 From dedd4915af40cff6614707c50dcae43d17beadec Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sat, 17 May 2008 08:28:33 +0200 Subject: bitops: fix build in struct thread_info we can move flags from u32 to natural size - assembly code uses offsetof. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 74481b7..25d7105 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -24,10 +24,10 @@ struct exec_domain; struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - __u32 flags; /* low level flags */ + unsigned long flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; -- cgit v1.1 From b9ada4281c48076bdb252813be24ddb57e17cade Mon Sep 17 00:00:00 2001 From: Andy Whitcroft <apw@shadowen.org> Date: Tue, 20 May 2008 11:01:08 +0100 Subject: x86: reinstate numa remap for SPARSEMEM on x86 NUMA systems Recent kernels have been panic'ing trying to allocate memory early in boot, in __alloc_pages: BUG: unable to handle kernel paging request at 00001568 IP: [<c10407b6>] __alloc_pages+0x33/0x2cc *pdpt = 00000000013a5001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Modules linked in: Pid: 1, comm: swapper Not tainted (2.6.25 #78) EIP: 0060:[<c10407b6>] EFLAGS: 00010246 CPU: 0 EIP is at __alloc_pages+0x33/0x2cc EAX: 00001564 EBX: 000412d0 ECX: 00001564 EDX: 000005c3 ESI: f78012a0 EDI: 00000001 EBP: 00001564 ESP: f7871e50 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f7870000 task=f786f670 task.ti=f7870000) Stack: 00000000 f786f670 00000010 00000000 0000b700 000412d0 f78012a0 00000001 00000000 c105b64d 00000000 000412d0 f78012a0 f7803120 00000000 c105c1c5 00000010 f7803144 000412d0 00000001 f7803130 f7803120 f78012a0 00000001 Call Trace: [<c105b64d>] kmem_getpages+0x94/0x129 [<c105c1c5>] cache_grow+0x8f/0x123 [<c105c689>] ____cache_alloc_node+0xb9/0xe4 [<c105c999>] kmem_cache_alloc_node+0x92/0xd2 [<c1018929>] build_sched_domains+0x536/0x70d [<c100b63c>] do_flush_tlb_all+0x0/0x3f [<c100b63c>] do_flush_tlb_all+0x0/0x3f [<c10572d6>] interleave_nodes+0x23/0x5a [<c105c44f>] alternate_node_alloc+0x43/0x5b [<c1018b47>] arch_init_sched_domains+0x46/0x51 [<c136e85e>] kernel_init+0x0/0x82 [<c137ac19>] sched_init_smp+0x10/0xbb [<c136e8a1>] kernel_init+0x43/0x82 [<c10035cf>] kernel_thread_helper+0x7/0x10 Debugging this showed that the NODE_DATA() for nodes other than node 0 were all NULL. Tracing this back showed that the NODE_DATA() pointers were being initialised to each nodes remap space. However under SPARSEMEM remap is disabled which leads to the pgdat's being placed incorrectly at kernel virtual address 0. Leading to the panic when attempting to allocate memory from these nodes. Numa remap was disabled in the commit below. This occured while fixing problems triggered when attempting to boot x86_32 NUMA SPARSEMEM kernels on non-numa hardware. x86: make NUMA work on 32-bit commit 1b000a5dbeb2f34bc03d45ebdf3f6d24a60c3aed The real problem is believed to be related to other alignment issues in the regions blocked out from the bootmem allocator for small memory systems, and has been fixed separately. Therefore re-enable remap for SPARSMEM, which fixes pgdat allocation issues. Testing confirms that SPARSMEM NUMA kernels will boot correctly with this part of the change reverted. Signed-off-by: Andy Whitcroft <apw@shadowen.org> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914ccf9..026201f 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -164,16 +164,13 @@ static void __init allocate_pgdat(int nid) } } -#ifdef CONFIG_DISCONTIGMEM /* - * In the discontig memory model, a portion of the kernel virtual area (KVA) - * is reserved and portions of nodes are mapped using it. This is to allow - * node-local memory to be allocated for structures that would normally require - * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers - * should be prepared to allocate from the bootmem allocator instead. This KVA - * mechanism is incompatible with SPARSEMEM as it makes assumptions about the - * layout of memory that are broken if alloc_remap() succeeds for some of the - * map and fails for others + * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel + * virtual address space (KVA) is reserved and portions of nodes are mapped + * using it. This is to allow node-local memory to be allocated for + * structures that would normally require ZONE_NORMAL. The memory is + * allocated with alloc_remap() and callers should be prepared to allocate + * from the bootmem allocator instead. */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; @@ -290,25 +287,6 @@ static void init_remap_allocator(int nid) (ulong) pfn_to_kaddr(highstart_pfn + node_remap_offset[nid] + node_remap_size[nid])); } -#else -void *alloc_remap(int nid, unsigned long size) -{ - return NULL; -} - -static unsigned long calculate_numa_remap_pages(void) -{ - return 0; -} - -static void init_remap_allocator(int nid) -{ -} - -void __init remap_numa_kva(void) -{ -} -#endif /* CONFIG_DISCONTIGMEM */ extern void setup_bootmem_allocator(void); unsigned long __init setup_memory(void) -- cgit v1.1 From 84d6bd0e272e6eace1e7e4c501f32bddfb665bb2 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft <apw@shadowen.org> Date: Tue, 20 May 2008 11:01:19 +0100 Subject: x86: cope with no remap space being allocated for a numa node When allocating the pgdat's for numa nodes on x86_32 we attempt to place them in the numa remap space for that node. However should the node not have any remap space allocated (such as due to having non-ram pages in the remap location in the node) then we will incorrectly place the pgdat at zero. Check we have remap available, falling back to node 0 memory where we do not. Signed-off-by: Andy Whitcroft <apw@shadowen.org> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 026201f..435c343 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,7 +156,7 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid)) + if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); -- cgit v1.1 From 0abbc78a0137fee60ef092f0b20a3d3d7e7e0cc2 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@ucw.cz> Date: Tue, 20 May 2008 16:27:17 +0200 Subject: x86, aperture_64: use symbolic constants Factor-out common aperture_valid code. Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 23 +---------------------- drivers/char/agp/amd64-agp.c | 22 ++++------------------ include/asm-x86/gart.h | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 02f4dba..5373f78 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,27 +109,6 @@ static u32 __init allocate_aperture(void) return (u32)__pa(p); } -static int __init aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) -{ - if (!aper_base) - return 0; - - if (aper_base + aper_size > 0x100000000UL) { - printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); - return 0; - } - if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } - if (aper_size < min_size) { - printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", - aper_size>>20, min_size>>20); - return 0; - } - - return 1; -} /* Find a PCI capability */ static __u32 __init find_cap(int bus, int slot, int func, int cap) @@ -344,7 +323,7 @@ out: if (gart_fix_e820 && !fix && aper_enabled) { if (!e820_all_mapped(aper_base, aper_base + aper_size, E820_RESERVED)) { - /* reserved it, so we can resuse it in second kernel */ + /* reserve it, so we can reuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); update_e820(); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e3c7ea0..f5af65a 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -228,24 +228,10 @@ static const struct agp_bridge_driver amd_8151_driver = { }; /* Some basic sanity checks for the aperture. */ -static int __devinit aperture_valid(u64 aper, u32 size) +static int __devinit agp_aperture_valid(u64 aper, u32 size) { - if (aper == 0) { - printk(KERN_ERR PFX "No aperture\n"); + if (!aperture_valid(aper, size, 32*1024*1024)) return 0; - } - if ((u64)aper + size > 0x100000000ULL) { - printk(KERN_ERR PFX "Aperture out of bounds\n"); - return 0; - } - if (e820_any_mapped(aper, aper + size, E820_RAM)) { - printk(KERN_ERR PFX "Aperture pointing to RAM\n"); - return 0; - } - if (size < 32*1024*1024) { - printk(KERN_ERR PFX "Aperture too small (%d MB)\n", size>>20); - return 0; - } /* Request the Aperture. This catches cases when someone else already put a mapping in there - happens with some very broken BIOS @@ -282,7 +268,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, nb_order = (nb_order >> 1) & 7; pci_read_config_dword(nb, AMD64_GARTAPERTUREBASE, &nb_base); nb_aper = nb_base << 25; - if (aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) { + if (agp_aperture_valid(nb_aper, (32*1024*1024)<<nb_order)) { return 0; } @@ -313,7 +299,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp, } printk(KERN_INFO PFX "Aperture from AGP @ %Lx size %u MB\n", aper, 32 << order); - if (order < 0 || !aperture_valid(aper, (32*1024*1024)<<order)) + if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order)) return -1; pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 6f22786..c818b96 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -1,6 +1,8 @@ #ifndef _ASM_X8664_IOMMU_H #define _ASM_X8664_IOMMU_H 1 +#include <asm/e820.h> + extern void pci_iommu_shutdown(void); extern void no_iommu_init(void); extern int force_iommu, no_iommu; @@ -69,4 +71,26 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } +static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) +{ + if (!aper_base) + return 0; + + if (aper_base + aper_size > 0x100000000ULL) { + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); + return 0; + } + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + if (aper_size < min_size) { + printk(KERN_ERR "Aperture too small (%d MB) than (%d MB)\n", + aper_size>>20, min_size>>20); + return 0; + } + + return 1; +} + #endif -- cgit v1.1 From 0748aca6f0af917591dcfd70dccac770a25d4f71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Tue, 13 May 2008 10:34:20 +0200 Subject: x86: remove useless static current_tsc_khz variable current_tsc_khz is just written by the init code and never used again. Remove it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/tsc_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index e479072..d53b104 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -283,7 +283,6 @@ core_initcall(cpufreq_tsc); /* clock source code */ -static unsigned long current_tsc_khz; static struct clocksource clocksource_tsc; /* @@ -434,9 +433,8 @@ void __init tsc_init(void) unsynchronized_tsc(); check_geode_tsc_reliable(); - current_tsc_khz = tsc_khz; - clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, - clocksource_tsc.shift); + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; -- cgit v1.1 From 3843fc2575e3389f4f0ad0420a720240a5746a5d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Fri, 9 May 2008 12:05:57 +0100 Subject: xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 ++++++++++++++++------------------------------ arch/x86/xen/mmu.c | 19 ++--------------- arch/x86/xen/mmu.h | 24 ++++++---------------- arch/x86/xen/xen-head.S | 4 ---- include/asm-x86/xen/page.h | 4 ---- 6 files changed, 27 insertions(+), 77 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be..525b108 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..a05b772 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 126766d..07c2653 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -222,7 +222,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; return ret; } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -272,12 +272,6 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. This one is intended for pinning a @@ -430,8 +424,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -441,14 +433,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b..5fe961c 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587..2ab5f42 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4d..e11f240 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x) return (pte_t) { .pte = x }; } -#ifdef CONFIG_X86_PAE #define pmd_val_ma(v) ((v).pmd) #define pud_val_ma(v) ((v).pgd.pgd) #define __pmd_ma(x) ((pmd_t) { (x) } ) -#else /* !X86_PAE */ -#define pmd_val_ma(v) ((v).pud.pgd.pgd) -#endif /* CONFIG_X86_PAE */ #define pgd_val_ma(x) ((x).pgd) -- cgit v1.1 From b478458aeebfc55fe409abec43794ac72a623c79 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: avoid re-loading LDT in unrelated address spaces Performance optimization. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0224c36..21f2bae 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -20,9 +20,9 @@ #include <asm/mmu_context.h> #ifdef CONFIG_SMP -static void flush_ldt(void *null) +static void flush_ldt(void *current_mm) { - if (current->active_mm) + if (current->active_mm == current_mm) load_LDT(¤t->active_mm->context); } #endif @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); + smp_call_function(flush_ldt, current->mm, 1, 1); preempt_enable(); #else load_LDT(pc); -- cgit v1.1 From 873b274a41c0cfe58b2eb0a7722424eb367b905b Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 22 May 2008 13:02:23 -0400 Subject: x86: Add Centaur and Transmeta CPUs to PAT whitelist Unconditionally enable PAT support on Centaur and Transmeta CPUs. All known models that advertise PAT have no known errata. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/addon_cpuid_features.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c2e1ce3..d8b3e4a 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -62,6 +62,9 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) return; break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_TRANSMETA: + return; } pat_disable(cpu_has_pat ? -- cgit v1.1 From c3ed64295f0fed9131b42122234b1ce4a4a266cf Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Fri, 16 May 2008 10:44:39 -0700 Subject: x86: change maximum NR_CPUS to 4096 and MAX_NUMNODES to 512 * Change the range of NR_CPUS from 2-255 to 2-4096 and change the range of MAX_NUMNODES (NODES_SHIFT) from 1-32768 to 1-512. * Alter comment about how much each increment of NR_CPUS consumes. (This was found by configuring for 256 cpus and then 512 cpus and dividing the difference by 256.) Signed-off-by: Mike Travis <travis@sgi.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Jones <davej@codemonkey.org.uk> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae..bbafeac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -565,18 +565,18 @@ config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) config NR_CPUS - int "Maximum number of CPUs (2-255)" - range 2 255 + int "Maximum number of CPUs (2-4096)" + range 2 4096 depends on SMP default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 255 and the + kernel will support. The maximum supported value is 4096 and the minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds - approximately eight kilobytes to the kernel image. + approximately one kilobyte to the kernel image. config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -968,8 +968,8 @@ config NUMA_EMU number of nodes. This is only useful for debugging. config NODES_SHIFT - int "Max num nodes shift(1-15)" - range 1 15 if X86_64 + int "Max num nodes shift(1-9)" + range 1 9 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" -- cgit v1.1 From 40bd21740012132afb62d78ac3e6b82372b2fbc2 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..f71a76e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 0000000..2decba6 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include <linux/linkage.h> +#endif /* CONFIG_X86_64 */ +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#ifdef CONFIG_X86_64 +#include <linux/timex.h> +#endif /* CONFIG_X86_64 */ +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#ifdef CONFIG_X86_64 +#include <asm/acpi.h> +#endif /* CONFIG_X86_64 */ +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#ifndef CONFIG_X86_64 +#include <asm/timer.h> +#else /* CONFIG_X86_64 */ +#include <asm/hw_irq.h> +#endif /* CONFIG_X86_64 */ +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#ifndef CONFIG_X86_64 +#include <asm/arch_hooks.h> +#endif /* ! CONFIG_X86_64 */ +#include <asm/i8259.h> + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + unsigned long flags; + int ret; + + spin_lock_irqsave(&i8259A_lock, flags); + if (irq < 8) + ret = inb(PIC_MASTER_CMD) & mask; + else + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, + "XT"); + enable_irq(irq); +} + +/* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. + * This has to be protected by the irq controller spinlock + * before being called. + */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ + return value; + } + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967..d669142 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include <asm/arch_hooks.h> #include <asm/i8259.h> -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(PIC_MASTER_CMD) & mask; - else - ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - "XT"); - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ - value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ - return value; - } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ - value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a15..a95d2bd 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -87,315 +87,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(PIC_MASTER_CMD) & mask; - else - ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - "XT"); - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ - value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ - return value; - } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ - value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3..2f98df9 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.1 From 6b4d3afbe722ee71b8da346c2c4393938440c471 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:47:24 +0200 Subject: x86: i8259.c: remove #ifdefs around includes Remove #ifdefs around includes; including too much should be always safe. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2decba6..8b7eb0c 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -1,14 +1,10 @@ -#ifdef CONFIG_X86_64 #include <linux/linkage.h> -#endif /* CONFIG_X86_64 */ #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> -#ifdef CONFIG_X86_64 #include <linux/timex.h> -#endif /* CONFIG_X86_64 */ #include <linux/slab.h> #include <linux/random.h> #include <linux/init.h> @@ -16,24 +12,17 @@ #include <linux/sysdev.h> #include <linux/bitops.h> -#ifdef CONFIG_X86_64 #include <asm/acpi.h> -#endif /* CONFIG_X86_64 */ #include <asm/atomic.h> #include <asm/system.h> #include <asm/io.h> -#ifndef CONFIG_X86_64 #include <asm/timer.h> -#else /* CONFIG_X86_64 */ #include <asm/hw_irq.h> -#endif /* CONFIG_X86_64 */ #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> -#ifndef CONFIG_X86_64 #include <asm/arch_hooks.h> -#endif /* ! CONFIG_X86_64 */ #include <asm/i8259.h> /* -- cgit v1.1 From 4f6f3bac18c80ff6cf072d90796755c69cc4c748 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:52:52 +0200 Subject: x86: i8259.c: remove trivial ifdefs Remove #ifdefs where the only difference is formatting of comments. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8b7eb0c..433e49e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -175,24 +175,14 @@ handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ -#else /* CONFIG_X86_64 */ /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ outb(cached_master_mask, PIC_MASTER_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ -#else /* CONFIG_X86_64 */ - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -215,12 +205,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { -#ifndef CONFIG_X86_64 - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#else /* CONFIG_X86_64 */ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#endif /* CONFIG_X86_64 */ spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); -- cgit v1.1 From 4d9a6e6128dd8ada0d2feed2a9bb6506043e4655 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:57:52 +0200 Subject: x86: i8259: cleanup codingstyle Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 433e49e..7a0fda8 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<<irq; if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + outb(0x0B, PIC_MASTER_CMD); /* ISR register */ value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ + outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ return value; } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } -- cgit v1.1 From f20b11e716936f85850654c49e06a1f955b9e5fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to initirq_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to initirq_32/64.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ------------------------ arch/x86/kernel/i8259_64.c | 203 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 ++++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 203 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 318 insertions(+), 318 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76e..2a53ad2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d669142..0000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/timer.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/arch_hooks.h> -#include <asm/i8259.h> - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index a95d2bd..0000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,203 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/acpi.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 0000000..d669142 --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/timer.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/arch_hooks.h> +#include <asm/i8259.h> + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 0000000..a95d2bd --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,203 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/acpi.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/hw_irq.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/i8259.h> + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.1 From fce39665abb71d01d74ac74eb13dd69a799dfc2f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: make init_ISA_irqs() static Moved to i8259 branch to avoid conflicts. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index a95d2bd..64bc0f1 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -120,7 +120,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -void __init init_ISA_irqs (void) +static void __init init_ISA_irqs (void) { int i; -- cgit v1.1 From 21fd5132b223a10bdf17713dd0bf321cbd6471d2 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:44:02 +0200 Subject: x86: automatical unification of i8259.c Make conversion of i8259 very mechanical -- i8259 was generated by diff -D, with too different parts left in i8259_32 and i8259_64.c. Only "by hand" changes were removal of #ifdef from middle of the comment (prevented compilation) and removal of one static to allow splitting into files. Of course, it will need some cleanups now, and those will follow. Signed-of-by: Pavel Machek <pavel@suse.cz> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259.c | 368 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/i8259_32.c | 295 ------------------------------------ arch/x86/kernel/i8259_64.c | 309 ------------------------------------- include/asm-x86/i8259.h | 2 + 5 files changed, 371 insertions(+), 605 deletions(-) create mode 100644 arch/x86/kernel/i8259.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..f71a76e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 0000000..2decba6 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,368 @@ +#ifdef CONFIG_X86_64 +#include <linux/linkage.h> +#endif /* CONFIG_X86_64 */ +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#ifdef CONFIG_X86_64 +#include <linux/timex.h> +#endif /* CONFIG_X86_64 */ +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#ifdef CONFIG_X86_64 +#include <asm/acpi.h> +#endif /* CONFIG_X86_64 */ +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#ifndef CONFIG_X86_64 +#include <asm/timer.h> +#else /* CONFIG_X86_64 */ +#include <asm/hw_irq.h> +#endif /* CONFIG_X86_64 */ +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#ifndef CONFIG_X86_64 +#include <asm/arch_hooks.h> +#endif /* ! CONFIG_X86_64 */ +#include <asm/i8259.h> + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + unsigned long flags; + int ret; + + spin_lock_irqsave(&i8259A_lock, flags); + if (irq < 8) + ret = inb(PIC_MASTER_CMD) & mask; + else + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, + "XT"); + enable_irq(irq); +} + +/* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. + * This has to be protected by the irq controller spinlock + * before being called. + */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ + return value; + } + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); +#ifndef CONFIG_X86_64 + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ +#else /* CONFIG_X86_64 */ + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); +#endif /* CONFIG_X86_64 */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { +#ifndef CONFIG_X86_64 + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); +#else /* CONFIG_X86_64 */ + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); +#endif /* CONFIG_X86_64 */ + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + .name = "i8259", + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); +#endif /* CONFIG_X86_64 */ + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ +#ifndef CONFIG_X86_64 + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ +#else /* CONFIG_X86_64 */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + +#endif /* CONFIG_X86_64 */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index fe631967..d669142 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,302 +21,7 @@ #include <asm/arch_hooks.h> #include <asm/i8259.h> -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(PIC_MASTER_CMD) & mask; - else - ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - "XT"); - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ - value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ - return value; - } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ - value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 1870e0e..b44095e 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -101,315 +101,6 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = #undef IRQ #undef IRQLIST_16 -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); - -static struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -unsigned int cached_irq_mask = 0xffff; - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_slave_mask, PIC_SLAVE_IMR); - else - outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(PIC_MASTER_CMD) & mask; - else - ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, - "XT"); - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ - value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ - return value; - } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ - value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecessarily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ - outb(cached_slave_mask, PIC_SLAVE_IMR); - /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); - /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); - } else { - inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ - outb(cached_master_mask, PIC_MASTER_IMR); - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG - "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(i8259A_auto_eoi); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ - outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - - /* - * outb_pic - this has to work on a wide range of PC hardware. - */ - outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); - /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); - if (auto_eoi) /* master does Auto EOI */ - outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ - outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - - outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); - /* 8259A-2 is a slave on master's IR2 */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); - /* (slave's support for AEOI in flat mode is to be investigated) */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); - - if (auto_eoi) - /* - * In AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ - outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - diff --git a/include/asm-x86/i8259.h b/include/asm-x86/i8259.h index 45d4df3..2f98df9 100644 --- a/include/asm-x86/i8259.h +++ b/include/asm-x86/i8259.h @@ -55,4 +55,6 @@ static inline void outb_pic(unsigned char value, unsigned int port) udelay(2); } +extern struct irq_chip i8259A_chip; + #endif /* __ASM_I8259_H__ */ -- cgit v1.1 From 15d613cb25efd978dd55592d011a6ffc487b3432 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:47:24 +0200 Subject: x86: i8259.c: remove #ifdefs around includes Remove #ifdefs around includes; including too much should be always safe. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2decba6..8b7eb0c 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -1,14 +1,10 @@ -#ifdef CONFIG_X86_64 #include <linux/linkage.h> -#endif /* CONFIG_X86_64 */ #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> -#ifdef CONFIG_X86_64 #include <linux/timex.h> -#endif /* CONFIG_X86_64 */ #include <linux/slab.h> #include <linux/random.h> #include <linux/init.h> @@ -16,24 +12,17 @@ #include <linux/sysdev.h> #include <linux/bitops.h> -#ifdef CONFIG_X86_64 #include <asm/acpi.h> -#endif /* CONFIG_X86_64 */ #include <asm/atomic.h> #include <asm/system.h> #include <asm/io.h> -#ifndef CONFIG_X86_64 #include <asm/timer.h> -#else /* CONFIG_X86_64 */ #include <asm/hw_irq.h> -#endif /* CONFIG_X86_64 */ #include <asm/pgtable.h> #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> -#ifndef CONFIG_X86_64 #include <asm/arch_hooks.h> -#endif /* ! CONFIG_X86_64 */ #include <asm/i8259.h> /* -- cgit v1.1 From 3e8631d27088f394b6788829e238a60bf07d47ab Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:52:52 +0200 Subject: x86: i8259.c: remove trivial ifdefs Remove #ifdefs where the only difference is formatting of comments. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8b7eb0c..433e49e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -175,24 +175,14 @@ handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ -#else /* CONFIG_X86_64 */ /* 'Specific EOI' to slave */ - outb(0x60+(irq&7),PIC_SLAVE_CMD); + outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ - outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ outb(cached_master_mask, PIC_MASTER_IMR); -#ifndef CONFIG_X86_64 - outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ -#else /* CONFIG_X86_64 */ - /* 'Specific EOI' to master */ - outb(0x60+irq,PIC_MASTER_CMD); -#endif /* CONFIG_X86_64 */ + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -215,12 +205,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { -#ifndef CONFIG_X86_64 - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#else /* CONFIG_X86_64 */ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); -#endif /* CONFIG_X86_64 */ spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); -- cgit v1.1 From 680afbf989d697b9ba1382017a73c8cfa53b3f89 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 21 May 2008 11:57:52 +0200 Subject: x86: i8259: cleanup codingstyle Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: macro@ds2.pg.gda.pl Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/i8259.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 433e49e..7a0fda8 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -129,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<<irq; if (irq < 8) { - outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + outb(0x0B, PIC_MASTER_CMD); /* ISR register */ value = inb(PIC_MASTER_CMD) & irqmask; - outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ + outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ return value; } - outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); - outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } -- cgit v1.1 From d23b200a75f78e62744c3c25d078855eec8a689b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: make init_ISA_irqs() static Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/i8259_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index b44095e..31f49e8 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -134,7 +134,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -void __init init_ISA_irqs (void) +static void __init init_ISA_irqs (void) { int i; -- cgit v1.1 From ec42418f1973e1f77da1fcca3be59c0acb878a9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 24 May 2008 15:59:58 +0200 Subject: x86: rename the i8259_32/64.c leftovers to irqinit_32/64.c The leftovers of the i8259 unification have nothing to do with i8259 at all. They contain interrupt init code and the i8259_xx name is just misleading now. Rename them to irqinit_32/64.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/i8259_32.c | 114 ----------------------- arch/x86/kernel/i8259_64.c | 217 ------------------------------------------- arch/x86/kernel/irqinit_32.c | 114 +++++++++++++++++++++++ arch/x86/kernel/irqinit_64.c | 217 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 332 insertions(+), 332 deletions(-) delete mode 100644 arch/x86/kernel/i8259_32.c delete mode 100644 arch/x86/kernel/i8259_64.c create mode 100644 arch/x86/kernel/irqinit_32.c create mode 100644 arch/x86/kernel/irqinit_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f71a76e..2a53ad2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o i8259_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c deleted file mode 100644 index d669142..0000000 --- a/arch/x86/kernel/i8259_32.c +++ /dev/null @@ -1,114 +0,0 @@ -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/timer.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/arch_hooks.h> -#include <asm/i8259.h> - - - -/* - * Note that on a 486, we don't want to do a SIGFPE on an irq13 - * as the irq is unreliable, and exception 16 works correctly - * (ie as explained in the intel literature). On a 386, you - * can't use exception 16 due to bad IBM design, so we have to - * rely on the less exact irq13. - * - * Careful.. Not only is IRQ13 unreliable, but it is also - * leads to races. IBM designers who came up with it should - * be shot. - */ - - -static irqreturn_t math_error_irq(int cpl, void *dev_id) -{ - extern void math_error(void __user *); - outb(0,0xF0); - if (ignore_fpu_irq || !boot_cpu_data.hard_math) - return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); - return IRQ_HANDLED; -} - -/* - * New motherboards sometimes make IRQ 13 be a PCI interrupt, - * so allow interrupt sharing. - */ -static struct irqaction fpu_irq = { - .handler = math_error_irq, - .mask = CPU_MASK_NONE, - .name = "fpu", -}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < 16; i++) { - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); - } - - /* setup after call gates are initialised (usually add in - * the architecture specific gates) - */ - intr_init_hook(); - - /* - * External FPU? Set up irq13 if so, for - * original braindamaged IBM FERR coupling. - */ - if (boot_cpu_data.hard_math && !cpu_has_fpu) - setup_irq(FPU_IRQ, &fpu_irq); - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c deleted file mode 100644 index 31f49e8..0000000 --- a/arch/x86/kernel/i8259_64.c +++ /dev/null @@ -1,217 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/acpi.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -static void __init init_ISA_irqs (void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].chip = &no_irq_chip; - } - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); -#endif - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c new file mode 100644 index 0000000..d669142 --- /dev/null +++ b/arch/x86/kernel/irqinit_32.c @@ -0,0 +1,114 @@ +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/timer.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/arch_hooks.h> +#include <asm/i8259.h> + + + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->ip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + /* + * 16 old-style INTA-cycle interrupts: + */ + for (i = 0; i < 16; i++) { + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c new file mode 100644 index 0000000..31f49e8 --- /dev/null +++ b/arch/x86/kernel/irqinit_64.c @@ -0,0 +1,217 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/acpi.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/hw_irq.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/i8259.h> + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ + asmlinkage void IRQ_NAME(nr); \ + asm("\n.p2align\n" \ + "IRQ" #nr "_interrupt:\n\t" \ + "push $~(" #nr ") ; " \ + "jmp common_interrupt"); + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) + +#undef BUILD_16_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +/* for the irq vectors */ +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { + IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) +}; + +#undef IRQ +#undef IRQLIST_16 + + + + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + +static void __init init_ISA_irqs (void) +{ + int i; + + init_bsp_APIC(); + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + +#ifdef CONFIG_SMP + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPIs for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); +#endif + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); + + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} -- cgit v1.1 From 3711ccb07b7f0a13f4f1aa16a8fdca9c930f21ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 24 May 2008 17:24:34 +0200 Subject: x86: fixup the fallout of the bitops changes Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/thread_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 25d7105..895339d 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -245,7 +245,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); + set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); } #endif /* !__ASSEMBLY__ */ -- cgit v1.1 From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: move tracedata to RODATA .. allowing it to be write-protected just as other read-only data under CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/vmlinux_32.lds.S | 7 ------- arch/x86/kernel/vmlinux_64.lds.S | 7 ------- drivers/base/power/trace.c | 2 +- include/asm-generic/vmlinux.lds.h | 14 ++++++++++++++ include/asm-x86/resume-trace.h | 2 +- include/linux/resume-trace.h | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed08..2674f57 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -60,13 +60,6 @@ SECTIONS BUG_TABLE :text - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - RODATA /* writeable */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674..687041b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -53,13 +53,6 @@ SECTIONS RODATA - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392..87a7f1d 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(void *tracedata, unsigned int user) +void generate_resume_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778..f1992dc 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -93,6 +93,8 @@ VMLINUX_SYMBOL(__end_rio_route_ops) = .; \ } \ \ + TRACEDATA \ + \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ @@ -318,6 +320,18 @@ __stop___bug_table = .; \ } +#ifdef CONFIG_PM_TRACE +#define TRACEDATA \ + . = ALIGN(4); \ + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { \ + __tracedata_start = .; \ + *(.tracedata) \ + __tracedata_end = .; \ + } +#else +#define TRACEDATA +#endif + #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_notes) = .; \ diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h index 2557514..8d9f0b4 100644 --- a/include/asm-x86/resume-trace.h +++ b/include/asm-x86/resume-trace.h @@ -6,7 +6,7 @@ #define TRACE_RESUME(user) \ do { \ if (pm_trace_enabled) { \ - void *tracedata; \ + const void *tracedata; \ asm volatile(_ASM_MOV_UL " $1f,%0\n" \ ".section .tracedata,\"a\"\n" \ "1:\t.word %c1\n\t" \ diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index f3f4f28..c9ba2fd 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -8,7 +8,7 @@ extern int pm_trace_enabled; struct device; extern void set_trace_device(struct device *); -extern void generate_resume_trace(void *tracedata, unsigned int user); +extern void generate_resume_trace(const void *tracedata, unsigned int user); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ -- cgit v1.1 From a2eddfa95919a730e0e5ed17e9c303fe5ba249cd Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: make /proc/stat account for all interrupts LAPIC interrupts, which don't go through the generic interrupt handling code, aren't accounted for in /proc/stat. Hence this patch adds a mechanism architectures can use to accordingly adjust the statistics. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/irq_32.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_64.c | 28 ++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 9 +++++++++ include/asm-x86/hardirq.h | 6 ++++++ 4 files changed, 81 insertions(+) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 147352d..468acd0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -313,16 +313,20 @@ skip: per_cpu(irq_stat,j).irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); +#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); @@ -331,6 +335,40 @@ skip: return 0; } +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = nmi_count(cpu); + +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += per_cpu(irq_stat, cpu).irq_resched_count; + sum += per_cpu(irq_stat, cpu).irq_call_count; + sum += per_cpu(irq_stat, cpu).irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(irq_stat, cpu).irq_thermal_count; +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += per_cpu(irq_stat, cpu).irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} + #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 3aac154..1f78b23 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -135,6 +135,7 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif +#ifdef CONFIG_X86_MCE seq_printf(p, "TRM: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); @@ -143,6 +144,7 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); +#endif seq_printf(p, "SPU: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); @@ -153,6 +155,32 @@ skip: } /* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = cpu_pda(cpu)->__nmi_count; + + sum += cpu_pda(cpu)->apic_timer_irqs; +#ifdef CONFIG_SMP + sum += cpu_pda(cpu)->irq_resched_count; + sum += cpu_pda(cpu)->irq_call_count; + sum += cpu_pda(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += cpu_pda(cpu)->irq_thermal_count; + sum += cpu_pda(cpu)->irq_threshold_count; +#endif + sum += cpu_pda(cpu)->irq_spurious_count; + return sum; +} + +u64 arch_irq_stat(void) +{ + return atomic_read(&irq_err_count); +} + +/* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 74a323d..903e617 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -472,6 +472,13 @@ static const struct file_operations proc_vmalloc_operations = { }; #endif +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -509,7 +516,9 @@ static int show_stat(struct seq_file *p, void *v) sum += temp; per_irq_sum[j] += temp; } + sum += arch_irq_stat_cpu(i); } + sum += arch_irq_stat(); seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), diff --git a/include/asm-x86/hardirq.h b/include/asm-x86/hardirq.h index 314434d..000787d 100644 --- a/include/asm-x86/hardirq.h +++ b/include/asm-x86/hardirq.h @@ -3,3 +3,9 @@ #else # include "hardirq_64.h" #endif + +extern u64 arch_irq_stat_cpu(unsigned int cpu); +#define arch_irq_stat_cpu arch_irq_stat_cpu + +extern u64 arch_irq_stat(void); +#define arch_irq_stat arch_irq_stat -- cgit v1.1 From c2c14fb7afcc67b48724571506e095fccc65a670 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: tsc_64.c make constant UL arch/x86/kernel/tsc_64.c:245:13: warning: constant 0x100000000 is so big it is long Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tsc_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index fcc16e5..c19b0e2 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -242,7 +242,7 @@ void __init tsc_calibrate(void) if (hpet) { printk(KERN_INFO "TSC calibrated against HPET\n"); if (hpet2 < hpet1) - hpet2 += 0x100000000; + hpet2 += 0x100000000UL; hpet2 -= hpet1; tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; } else { -- cgit v1.1 From e6b0edef3453677b13e175a104a83eb36d062dd3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: clean up vdso_enabled type on x86_64 This fixes type of "vdso_enabled" on X86_64 to match extern in asm/elf.h. Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 3fdd514..19a6cfa 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -16,7 +16,7 @@ #include "vextern.h" /* Just for VMAGIC. */ #undef VEXTERN -int vdso_enabled = 1; +unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; -- cgit v1.1 From 46dd98a5c0b907f94742579ab605be1933a37abd Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Wed, 14 May 2008 16:10:39 -0700 Subject: arch/x86/mm/pat.c: use boot_cpu_has() arch/x86/mm/pat.c: In function 'phys_mem_access_prot_allowed': arch/x86/mm/pat.c:526: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type arch/x86/mm/pat.c:526: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type arch/x86/mm/pat.c:527: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type arch/x86/mm/pat.c:527: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type arch/x86/mm/pat.c:528: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type arch/x86/mm/pat.c:528: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type arch/x86/mm/pat.c:529: warning: passing argument 2 of 'constant_test_bit' from incompatible pointer type arch/x86/mm/pat.c:529: warning: passing argument 2 of 'variable_test_bit' from incompatible pointer type Don't open-code test_bit() on a __u32 Cc: Andrea Arcangeli <andrea@qumranet.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a1cbea0..e83b770 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -536,10 +536,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * we maintain the tradition of paranoia in this code. */ if (!pat_wc_enabled && - ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + ! ( boot_cpu_has(X86_FEATURE_MTRR) || + boot_cpu_has(X86_FEATURE_K6_MTRR) || + boot_cpu_has(X86_FEATURE_CYRIX_ARR) || + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && (pfn << PAGE_SHIFT) >= __pa(high_memory)) { flags = _PAGE_CACHE_UC; } -- cgit v1.1 From 5136dea5734cfddbc6d7ccb7ead85a3ac7ce3de2 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Wed, 14 May 2008 16:10:41 -0700 Subject: x86: bitops take an unsigned long * All (or most) other architectures do this. So should x86. Fix. Cc: Andrea Arcangeli <andrea@qumranet.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/bitops.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h index ee4b3ea..7d2494b 100644 --- a/include/asm-x86/bitops.h +++ b/include/asm-x86/bitops.h @@ -43,7 +43,7 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void set_bit(int nr, volatile void *addr) +static inline void set_bit(int nr, volatile unsigned long *addr) { asm volatile(LOCK_PREFIX "bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -57,7 +57,7 @@ static inline void set_bit(int nr, volatile void *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile void *addr) +static inline void __set_bit(int nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -72,7 +72,7 @@ static inline void __set_bit(int nr, volatile void *addr) * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void clear_bit(int nr, volatile void *addr) +static inline void clear_bit(int nr, volatile unsigned long *addr) { asm volatile(LOCK_PREFIX "btr %1,%0" : ADDR : "Ir" (nr)); } @@ -85,13 +85,13 @@ static inline void clear_bit(int nr, volatile void *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(unsigned nr, volatile void *addr) +static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(int nr, volatile void *addr) +static inline void __clear_bit(int nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -108,7 +108,7 @@ static inline void __clear_bit(int nr, volatile void *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(unsigned nr, volatile void *addr) +static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -126,7 +126,7 @@ static inline void __clear_bit_unlock(unsigned nr, volatile void *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile void *addr) +static inline void __change_bit(int nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -140,7 +140,7 @@ static inline void __change_bit(int nr, volatile void *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(int nr, volatile void *addr) +static inline void change_bit(int nr, volatile unsigned long *addr) { asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); } @@ -153,7 +153,7 @@ static inline void change_bit(int nr, volatile void *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(int nr, volatile void *addr) +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -170,7 +170,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr) * * This is the same as test_and_set_bit on x86. */ -static inline int test_and_set_bit_lock(int nr, volatile void *addr) +static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); } @@ -184,7 +184,7 @@ static inline int test_and_set_bit_lock(int nr, volatile void *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_set_bit(int nr, volatile void *addr) +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -203,7 +203,7 @@ static inline int __test_and_set_bit(int nr, volatile void *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(int nr, volatile void *addr) +static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -223,7 +223,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_clear_bit(int nr, volatile void *addr) +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -235,7 +235,7 @@ static inline int __test_and_clear_bit(int nr, volatile void *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(int nr, volatile void *addr) +static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -255,7 +255,7 @@ static inline int __test_and_change_bit(int nr, volatile void *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(int nr, volatile void *addr) +static inline int test_and_change_bit(int nr, volatile unsigned long *addr) { int oldbit; @@ -266,13 +266,13 @@ static inline int test_and_change_bit(int nr, volatile void *addr) return oldbit; } -static inline int constant_test_bit(int nr, const volatile void *addr) +static inline int constant_test_bit(int nr, const volatile unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; } -static inline int variable_test_bit(int nr, volatile const void *addr) +static inline int variable_test_bit(int nr, volatile const unsigned long *addr) { int oldbit; -- cgit v1.1 From eef8f871d84b5df1a24902a4e4700188be1aff2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:34 +0200 Subject: x86: vsmp_64 add missing includes sparse mutters: arch/x86/kernel/vsmp_64.c:126:5: warning: symbol 'is_vsmp_box' was not declared. Should it be static? arch/x86/kernel/vsmp_64.c:145:13: warning: symbol 'vsmp_init' was not declared. Should it be static? Include the appropriate headers. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/vsmp_64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index ba8c0b7..0c029e8 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -15,9 +15,12 @@ #include <linux/init.h> #include <linux/pci_ids.h> #include <linux/pci_regs.h> + +#include <asm/apic.h> #include <asm/pci-direct.h> #include <asm/io.h> #include <asm/paravirt.h> +#include <asm/setup.h> #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* -- cgit v1.1 From 535694f361419ca195fd915dd5038c926334e1be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: boot/printfc use NULL instead 0 Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/boot/printf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index c1d00c02..50e47cd 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -56,7 +56,7 @@ static char *number(char *str, long num, int base, int size, int precision, if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) - return 0; + return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; if (type & SIGN) { -- cgit v1.1 From 635ee418381566f03819408e1303ef21fcf2d41c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: create prototype for (un)map_devmem Global functions need a prototype. Add it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/page.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index dc936dd..ed16509 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -51,8 +51,15 @@ #ifndef __ASSEMBLY__ +typedef struct { pgdval_t pgd; } pgd_t; +typedef struct { pgprotval_t pgprot; } pgprot_t; + extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); +extern void map_devmem(unsigned long pfn, unsigned long size, + pgprot_t vma_prot); +extern void unmap_devmem(unsigned long pfn, unsigned long size, + pgprot_t vma_prot); extern unsigned long max_pfn_mapped; @@ -74,9 +81,6 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE -typedef struct { pgdval_t pgd; } pgd_t; -typedef struct { pgprotval_t pgprot; } pgprot_t; - static inline pgd_t native_make_pgd(pgdval_t val) { return (pgd_t) { val }; -- cgit v1.1 From 0eafe234a2571f51ef9f7e8baddb58c828750771 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: k8topology add missing header k8_scan_nodes is global and needs a prototype. Add the header file which contains it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/k8topology_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 1f476e4..4847119 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -22,6 +22,7 @@ #include <asm/numa.h> #include <asm/mpspec.h> #include <asm/apic.h> +#include <asm/k8.h> static __init int find_northbridge(void) { -- cgit v1.1 From d34c08958fa36c7a8c3f8d9c0ebe6ec1ab744a68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: k8topology fix shadow variable sparse mutters: arch/x86/mm/k8topology_64.c:108:7: warning: symbol 'nodeid' shadows an earlier one Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/k8topology_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 4847119..36c6c4a 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -76,10 +76,10 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, nb; + int i, nb; unsigned char nodeids[8]; int found = 0; - u32 reg; + u32 nodeid, reg; unsigned numnodes; unsigned cores; unsigned bits; @@ -106,7 +106,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = 0; for (i = 0; i < 8; i++) { unsigned long base, limit; - u32 nodeid; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); -- cgit v1.1 From 55d4f22abc9f011c5db982f83377cdd38bb9a2a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: k8topology cleanup variable declarations Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/k8topology_64.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 36c6c4a..0ea66b5 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -74,17 +74,12 @@ static __init void early_get_boot_cpu_id(void) int __init k8_scan_nodes(unsigned long start, unsigned long end) { + unsigned numnodes, cores, bits, apicid_base; unsigned long prevbase; struct bootnode nodes[8]; - int i, nb; unsigned char nodeids[8]; - int found = 0; + int i, j, nb, found = 0; u32 nodeid, reg; - unsigned numnodes; - unsigned cores; - unsigned bits; - int j; - unsigned apicid_base; if (!early_pci_allowed()) return -1; -- cgit v1.1 From d1097635deed7196c2a859f285c29267779ca45a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 23:42:01 +0200 Subject: x86: move mmconfig declarations to header arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and the inline stubs to the appropriate header file. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/setup_64.c | 13 +------------ include/asm-x86/mmconfig.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 include/asm-x86/mmconfig.h diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index edc5fbf..fdfdc55 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/msr.h> #include <asm/acpi.h> +#include <asm/mmconfig.h> #include "../pci/pci.h" diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff128..341230d 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -71,6 +71,7 @@ #include <asm/topology.h> #include <asm/trampoline.h> #include <asm/pat.h> +#include <asm/mmconfig.h> #include <mach_apic.h> #ifdef CONFIG_PARAVIRT @@ -293,18 +294,6 @@ static void __init parse_setup_data(void) } } -#ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __init check_enable_amd_mmconf_dmi(void); -#else -void __cpuinit fam10h_check_enable_mmcfg(void) -{ -} -void __init check_enable_amd_mmconf_dmi(void) -{ -} -#endif - /* * setup_arch - architecture-specific boot-time initializations * diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h new file mode 100644 index 0000000..95beda0 --- /dev/null +++ b/include/asm-x86/mmconfig.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MMCONFIG_H +#define _ASM_MMCONFIG_H + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +static inline void fam10h_check_enable_mmcfg(void) { } +static inline void check_enable_amd_mmconf_dmi(void) { } +#endif + +#endif -- cgit v1.1 From 11034d55975be6d8b955a6eb11e87fc67ec086c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: init64.c include initrd.h free_initrd_mem needs a prototype, which is in linux/initrd.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b..1812309 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -18,6 +18,7 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/bootmem.h> #include <linux/proc_fs.h> -- cgit v1.1 From e0b32d768cad17af07af3aced67498bde98296c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: make command_line static in setup_64.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 341230d..78c1be6 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(edid_info); extern int root_mountflags; -char __initdata command_line[COMMAND_LINE_SIZE]; +static char __initdata command_line[COMMAND_LINE_SIZE]; static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, -- cgit v1.1 From 968cbfad1a727b5689ae620f4d970abf6ce73340 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: make __pci_mmcfg_init static in mmconfig-shared.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/pci/mmconfig-shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 0cfebec..23faaa8 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -374,7 +374,7 @@ reject: static int __initdata known_bridge; -void __init __pci_mmcfg_init(int early) +static void __init __pci_mmcfg_init(int early) { /* MMCONFIG disabled */ if ((pci_probe & PCI_PROBE_MMCONF) == 0) -- cgit v1.1 From 311f83494830fec17f086401a5b43984bb447cd2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk <bunk@kernel.org> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: kernel/pci-dma.c cleanups This patch contains the following cleanups: - make the following needlessly global code static: - dma_alloc_pages() Signed-off-by: Adrian Bunk <bunk@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c5ef1af..5bc29ca 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -357,7 +357,7 @@ int dma_supported(struct device *dev, u64 mask) EXPORT_SYMBOL(dma_supported); /* Allocate DMA memory on node near device */ -noinline struct page * +static noinline struct page * dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { int node; -- cgit v1.1 From cca7c0850fa0381639bf84dd06ee900e16346373 Mon Sep 17 00:00:00 2001 From: Jiri Slaby <jirislaby@gmail.com> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86_64: fix mm.txt documentation Commit 85eb69a16aab5a394ce043c2131319eae35e6493 introduced 512 MiB sized kernel and 1.5 GiB sized module space but omitted to change documentation properly. Fix that. [Wasn't the hole intentional protection hole?] Signed-off-by: Jiri Slaby <jirislaby@gmail.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: hpa@zytor.com Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- Documentation/x86_64/mm.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt index b89b6d2..efce750 100644 --- a/Documentation/x86_64/mm.txt +++ b/Documentation/x86_64/mm.txt @@ -11,9 +11,8 @@ ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... -ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0 -... unused hole ... -ffffffff88000000 - fffffffffff00000 (=1919 MB) module mapping space +ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 +ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space The direct mapping covers all memory in the system up to the highest memory address (this means in some cases it can also include PCI memory -- cgit v1.1 From 6a1673ae2201931ce34bf7ba9f90928c864d8334 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: make memory_add_physaddr_to_nid depend on MEMORY_HOTPLUG memory_add_physaddr_to_nid() is only used in the CONFIG_MEMORY_HOTPLUG_SPARSE || CONFIG_ACPI_HOTPLUG_MEMORY case. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/srat_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 3890234..f6d0584 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -522,6 +522,7 @@ int __node_distance(int a, int b) EXPORT_SYMBOL(__node_distance); +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) int memory_add_physaddr_to_nid(u64 start) { int i, ret = 0; @@ -533,4 +534,4 @@ int memory_add_physaddr_to_nid(u64 start) return ret; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - +#endif -- cgit v1.1 From 4e50e62ce52f39b5f45e82f68b18254458b429bb Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: eliminate duplicate consistency checks in init_32.c Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/mm/init_32.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10..4834c0f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -571,17 +571,6 @@ void __init mem_init(void) #endif bad_ppro = ppro_with_ram_bug(); -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR - "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, - FIXADDR_START); - BUG(); - } -#endif /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -614,7 +603,6 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); -#if 1 /* double-sanity-check paranoia */ printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM @@ -655,7 +643,6 @@ void __init mem_init(void) #endif BUG_ON(VMALLOC_START > VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); -#endif /* double-sanity-check paranoia */ if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); -- cgit v1.1 From 4c8ab98249fa3cead320ec0ee4cde9960b951989 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: i386: move FIX_ACPI_* into non-permanent range .. as they are used at early boot time only. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/fixmap_32.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/fixmap_32.h b/include/asm-x86/fixmap_32.h index 4b96148..f0df7ee 100644 --- a/include/asm-x86/fixmap_32.h +++ b/include/asm-x86/fixmap_32.h @@ -79,10 +79,6 @@ enum fixed_addresses { FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, #endif -#ifdef CONFIG_ACPI - FIX_ACPI_BEGIN, - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, -#endif #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -103,6 +99,10 @@ enum fixed_addresses { (__end_of_permanent_fixed_addresses & 511), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, FIX_WP_TEST, +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif -- cgit v1.1 From ebdd561a19d7917ac49fff9c355aa4833c504bf1 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: constify data in reboot.c Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/reboot.c | 18 +++++++++--------- arch/x86/kernel/reboot_fixups_32.c | 4 ++-- include/asm-x86/reboot.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f6be7d5..f8a6216 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -27,7 +27,7 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static long no_idt[3]; +static const struct desc_ptr no_idt = {}; static int reboot_mode; enum reboot_type reboot_type = BOOT_KBD; int reboot_force; @@ -201,15 +201,15 @@ core_initcall(reboot_init); controller to pulse the CPU reset line, which is more thorough, but doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. */ -static unsigned long long +static const unsigned long long real_mode_gdt_entries [3] = { 0x0000000000000000ULL, /* Null descriptor */ - 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ + 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ + 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct desc_ptr +static const struct desc_ptr real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, real_mode_idt = { 0x3ff, 0 }; @@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ -static unsigned char real_mode_switch [] = +static const unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ @@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] = 0x24, 0x10, /* f: andb $0x10,al */ 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ }; -static unsigned char jump_to_bios [] = +static const unsigned char jump_to_bios [] = { 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ }; @@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] = * specified by the code and length parameters. * We assume that length will aways be less that 100! */ -void machine_real_restart(unsigned char *code, int length) +void machine_real_restart(const unsigned char *code, int length) { local_irq_disable(); @@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void) } case BOOT_TRIPLE: - load_idt((const struct desc_ptr *)&no_idt); + load_idt(&no_idt); __asm__ __volatile__("int3"); reboot_type = BOOT_KBD; diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index dec0b5e..61a8377 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -49,7 +49,7 @@ struct device_fixup { void (*reboot_fixup)(struct pci_dev *); }; -static struct device_fixup fixups_table[] = { +static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, @@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = { */ void mach_reboot_fixups(void) { - struct device_fixup *cur; + const struct device_fixup *cur; struct pci_dev *dev; int i; diff --git a/include/asm-x86/reboot.h b/include/asm-x86/reboot.h index e63741f..206f355 100644 --- a/include/asm-x86/reboot.h +++ b/include/asm-x86/reboot.h @@ -14,8 +14,8 @@ struct machine_ops { extern struct machine_ops machine_ops; -void machine_real_restart(unsigned char *code, int length); void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); +void machine_real_restart(const unsigned char *code, int length); #endif /* _ASM_REBOOT_H */ -- cgit v1.1 From 9831bfb201a8e00415d0f9e5b5a50d902a90b2db Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86 - hide X86_VM_MASK from userland programs v3 X86_VM_MASK is kernel specific flags so hide it from userland programs. It should be defined *before* ptrace.h inclusion because of circular link between these files Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/vm86.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/asm-x86/vm86.h b/include/asm-x86/vm86.h index 074b357..cbf4a0e 100644 --- a/include/asm-x86/vm86.h +++ b/include/asm-x86/vm86.h @@ -14,12 +14,6 @@ #include <asm/processor-flags.h> -#ifdef CONFIG_VM86 -#define X86_VM_MASK X86_EFLAGS_VM -#else -#define X86_VM_MASK 0 /* No VM86 support */ -#endif - #define BIOSSEG 0x0f000 #define CPU_086 0 @@ -133,6 +127,13 @@ struct vm86plus_struct { }; #ifdef __KERNEL__ + +#ifdef CONFIG_VM86 +#define X86_VM_MASK X86_EFLAGS_VM +#else +#define X86_VM_MASK 0 /* No VM86 support */ +#endif + /* * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86 * mode - the main change is that the old segment descriptors aren't -- cgit v1.1 From 369101da7e3f6f971b7303922d2978b8483241c6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: head_64.S cleanup - use predefined flags from processor-flags.h We should better use already defined flags from processor-flags.h instead of defining own ones [>>> object code check >>>] original md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4 arch/x86/kernel/head_64.o text data bss dec hex filename 37361 4432 8192 49985 c341 arch/x86/kernel/head_64.o patched md5sum: 9cfa6dbf045a046bb5dfb85f8bcfe8c4 arch/x86/kernel/head_64.o text data bss dec hex filename 37361 4432 8192 49985 c341 arch/x86/kernel/head_64.o [<<< object code check <<<] Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 10a1955..2f59ca8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -18,6 +18,7 @@ #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> +#include <asm/processor-flags.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -184,14 +185,10 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_PM 1 /* protected mode */ -#define CR0_MP (1<<1) -#define CR0_ET (1<<4) -#define CR0_NE (1<<5) -#define CR0_WP (1<<16) -#define CR0_AM (1<<18) -#define CR0_PAGING (1<<31) - movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 -- cgit v1.1 From e83e31f41a623262714d9c56dde7dfc51ac078f7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Mon, 12 May 2008 15:43:39 +0200 Subject: x86: compressed/head_64.S cleanup - use predefined flags from processor-flags.h We should better use already defined flags from processor-flags.h instead of defining own ones [>>> object code check >>>] original md5sum: 129f24be6df396fb7d8bf998c01fc716 arch/x86/boot/compressed/head_64.o text data bss dec hex filename 705 48 45056 45809 b2f1 arch/x86/boot/compressed/head_64.o patched md5sum: 129f24be6df396fb7d8bf998c01fc716 arch/x86/boot/compressed/head_64-new.o text data bss dec hex filename 705 48 45056 45809 b2f1 arch/x86/boot/compressed/head_64-new.o [<<< object code check <<<] Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/boot/compressed/head_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d8819ef..1d5dff4 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -30,6 +30,7 @@ #include <asm/page.h> #include <asm/boot.h> #include <asm/msr.h> +#include <asm/processor-flags.h> #include <asm/asm-offsets.h> .section ".text.head" @@ -109,7 +110,7 @@ startup_32: /* Enable PAE mode */ xorl %eax, %eax - orl $(1 << 5), %eax + orl $(X86_CR4_PAE), %eax movl %eax, %cr4 /* @@ -170,7 +171,7 @@ startup_32: pushl %eax /* Enter paged protected Mode, activating Long Mode */ - movl $0x80000001, %eax /* Enable Paging and Protected mode */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ movl %eax, %cr0 /* Jump from 32bit compatibility mode into 64bit mode. */ -- cgit v1.1 From 2cc74111c78dbc7b26c41b4e87cfebfc3aed49c4 Mon Sep 17 00:00:00 2001 From: Huang Weiyi <weiyi.huang@gmail.com> Date: Sun, 11 May 2008 19:35:54 +0800 Subject: x86: ipi.c: removed duplicated include Removed duplicated include <linux/interrupt.h> in arch/x86/kernel/ipi.c. Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com> Cc: mingo@redhat.com Cc: hpa@zytor.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/ipi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index c0df7b8..9d98cda 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -8,7 +8,6 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/cache.h> -#include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/module.h> -- cgit v1.1 From 883b7af932b4435eb4798cfa4fec0848639c2a87 Mon Sep 17 00:00:00 2001 From: Huang Weiyi <weiyi.huang@gmail.com> Date: Sun, 11 May 2008 19:36:57 +0800 Subject: x86: smpboot.c: removed duplicated include Removed duplicated include <asm/nmi.h> in arch/x86/kernel/smpboot.c. Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com> Cc: mingo@redhat.com Cc: hpa@zytor.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3898849..0974fc0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -59,7 +59,6 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> -#include <asm/nmi.h> #include <asm/vmi.h> #include <asm/genapic.h> #include <linux/mc146818rtc.h> -- cgit v1.1 From 2237ce2057b82561f7ea27ed30153571f404112d Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp> Date: Fri, 16 May 2008 11:01:26 +0900 Subject: x86: cleanup, remove duplicate declaration of unknown_nmi_panic Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/nmi.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1e36302..8455bf3 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -46,7 +46,6 @@ extern void nmi_watchdog_default(void); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -extern int unknown_nmi_panic; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); -- cgit v1.1 From 05139d8fb445d9a3569071af1c5920fc46191b7b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 13 May 2008 21:14:22 +0400 Subject: x86: head_64.S cleanup - use straight move to CR4 register Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2f59ca8..7475b4c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -155,9 +155,7 @@ ENTRY(secondary_startup_64) */ /* Enable PAE mode and PGE */ - xorq %rax, %rax - btsq $5, %rax - btsq $7, %rax + movl $(X86_CR4_PAE | X86_CR4_PGE), %eax movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ -- cgit v1.1 From 0e192b99d7d09f08b445c31ee7c7f3d0bfb27345 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 13 May 2008 20:55:40 +0400 Subject: x86: head_64.S cleanup - use PMD_SHIFT instead of numeric constant Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7475b4c..d8ed325 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -322,11 +322,11 @@ early_idt_ripmsg: ENTRY(name) /* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << 21) + (PERM) ; \ - i = i + 1 ; \ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ .endr /* -- cgit v1.1 From 23eb271b9186531e1eb704dda9ab37abcfd0ca4a Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Wed, 14 May 2008 16:10:39 -0700 Subject: x86: setup_force_cpu_cap(): don't do clear_bit(non-unsigned-long) Another hack to make proper prototyping of x86 bitops viable. Cc: Andrea Arcangeli <andrea@qumranet.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/cpufeature.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 0d609c8..78b47e7 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -142,11 +142,11 @@ extern const char * const x86_power_flags[32]; #define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) #define setup_clear_cpu_cap(bit) do { \ clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cleared_cpu_caps); \ } while (0) #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ - clear_bit(bit, cleared_cpu_caps); \ + clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -- cgit v1.1 From bfe4bb1526945e446d2912bef2e1e2cbd2c7349e Mon Sep 17 00:00:00 2001 From: Miklos Vajna <vmiklos@frugalware.org> Date: Sat, 17 May 2008 22:48:13 +0200 Subject: x86: janitor work in bugs.c Just moved trailing statements to the next line, removed space before open/close parenthesis, wrapped long lines. Signed-off-by: Miklos Vajna <vmiklos@frugalware.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/bugs.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 170d2f5..1b1c56b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -59,8 +59,12 @@ static void __init check_fpu(void) return; } -/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ - /* Test for the divl bug.. */ + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug.. + */ __asm__("fninit\n\t" "fldl %1\n\t" "fdivl %2\n\t" @@ -108,10 +112,15 @@ static void __init check_popad(void) "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) : "d" (inp) - : "ecx", "edi" ); - /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ - if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); + : "ecx", "edi"); + /* + * If this fails, it means that any user program may lock the + * CPU hard. Too bad. + */ + if (res != 12345678) + printk("Buggy.\n"); + else + printk("OK.\n"); #endif } @@ -137,7 +146,8 @@ static void __init check_config(void) * i486+ only features! (WP works in supervisor mode and the * new "invlpg" and "bswap" instructions) */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) +#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ + defined(CONFIG_X86_BSWAP) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif @@ -170,6 +180,7 @@ void __init check_bugs(void) check_fpu(); check_hlt(); check_popad(); - init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); } -- cgit v1.1 From f0766440dda7ace8a43b030f75e2dcb82449fb85 Mon Sep 17 00:00:00 2001 From: Christoph Lameter <clameter@sgi.com> Date: Fri, 9 May 2008 19:09:48 -0700 Subject: x86: unify current.h Simply stitch these together. There are just two definitions that are shared but the file is resonably small and putting these things together shows that further unifications requires a unification of the per cpu / pda handling between both arches. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/current.h | 42 ++++++++++++++++++++++++++++++++++++++---- include/asm-x86/current_32.h | 17 ----------------- include/asm-x86/current_64.h | 27 --------------------------- 3 files changed, 38 insertions(+), 48 deletions(-) delete mode 100644 include/asm-x86/current_32.h delete mode 100644 include/asm-x86/current_64.h diff --git a/include/asm-x86/current.h b/include/asm-x86/current.h index d2526d3..7515c19 100644 --- a/include/asm-x86/current.h +++ b/include/asm-x86/current.h @@ -1,5 +1,39 @@ +#ifndef _X86_CURRENT_H +#define _X86_CURRENT_H + #ifdef CONFIG_X86_32 -# include "current_32.h" -#else -# include "current_64.h" -#endif +#include <linux/compiler.h> +#include <asm/percpu.h> + +struct task_struct; + +DECLARE_PER_CPU(struct task_struct *, current_task); +static __always_inline struct task_struct *get_current(void) +{ + return x86_read_percpu(current_task); +} + +#else /* X86_32 */ + +#ifndef __ASSEMBLY__ +#include <asm/pda.h> + +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return read_pda(pcurrent); +} + +#else /* __ASSEMBLY__ */ + +#include <asm/asm-offsets.h> +#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg + +#endif /* __ASSEMBLY__ */ + +#endif /* X86_32 */ + +#define current get_current() + +#endif /* X86_CURRENT_H */ diff --git a/include/asm-x86/current_32.h b/include/asm-x86/current_32.h deleted file mode 100644 index 5af9bdb..0000000 --- a/include/asm-x86/current_32.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _I386_CURRENT_H -#define _I386_CURRENT_H - -#include <linux/compiler.h> -#include <asm/percpu.h> - -struct task_struct; - -DECLARE_PER_CPU(struct task_struct *, current_task); -static __always_inline struct task_struct *get_current(void) -{ - return x86_read_percpu(current_task); -} - -#define current get_current() - -#endif /* !(_I386_CURRENT_H) */ diff --git a/include/asm-x86/current_64.h b/include/asm-x86/current_64.h deleted file mode 100644 index 2d368ed..0000000 --- a/include/asm-x86/current_64.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _X86_64_CURRENT_H -#define _X86_64_CURRENT_H - -#if !defined(__ASSEMBLY__) -struct task_struct; - -#include <asm/pda.h> - -static inline struct task_struct *get_current(void) -{ - struct task_struct *t = read_pda(pcurrent); - return t; -} - -#define current get_current() - -#else - -#ifndef ASM_OFFSET_H -#include <asm/asm-offsets.h> -#endif - -#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg - -#endif - -#endif /* !(_X86_64_CURRENT_H) */ -- cgit v1.1 From 2141261a11524acbfce86a5a213bbaea08794bbb Mon Sep 17 00:00:00 2001 From: Miklos Vajna <vmiklos@frugalware.org> Date: Tue, 13 May 2008 13:35:25 +0200 Subject: x86: janitor work in video-vga.c Just moved an open brace to the previous line. Signed-off-by: Miklos Vajna <vmiklos@frugalware.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/boot/video-vga.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 40ecb8d..b939cb4 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -259,8 +259,7 @@ static int vga_probe(void) return mode_count[adapter]; } -__videocard video_vga = -{ +__videocard video_vga = { .card_name = "VGA", .probe = vga_probe, .set_mode = vga_set_mode, -- cgit v1.1 From 78d64fc21d2aa425c65de49765cddecc84d595a4 Mon Sep 17 00:00:00 2001 From: Joe Perches <joe@perches.com> Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: include/asm-x86/string_32.h - style only Looked at this file because of __memcpy warnings. Thought it could use a style/checkpatch cleanup. No change in vmlinux. [tglx: fixed the remaining issues ] Signed-off-by: Joe Perches <joe@perches.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/string_32.h | 316 +++++++++++++++++++++++++------------------- 1 file changed, 181 insertions(+), 135 deletions(-) diff --git a/include/asm-x86/string_32.h b/include/asm-x86/string_32.h index b49369a..8d0c593 100644 --- a/include/asm-x86/string_32.h +++ b/include/asm-x86/string_32.h @@ -29,81 +29,116 @@ extern char *strchr(const char *s, int c); #define __HAVE_ARCH_STRLEN extern size_t strlen(const char *s); -static __always_inline void * __memcpy(void * to, const void * from, size_t n) +static __always_inline void *__memcpy(void *to, const void *from, size_t n) { -int d0, d1, d2; -__asm__ __volatile__( - "rep ; movsl\n\t" - "movl %4,%%ecx\n\t" - "andl $3,%%ecx\n\t" - "jz 1f\n\t" - "rep ; movsb\n\t" - "1:" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) - : "memory"); -return (to); + int d0, d1, d2; + asm volatile("rep ; movsl\n\t" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" + "jz 1f\n\t" + "rep ; movsb\n\t" + "1:" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) + : "memory"); + return to; } /* * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ -static __always_inline void * __constant_memcpy(void * to, const void * from, size_t n) +static __always_inline void *__constant_memcpy(void *to, const void *from, + size_t n) { long esi, edi; - if (!n) return to; -#if 1 /* want to do small copies with non-string ops? */ + if (!n) + return to; + switch (n) { - case 1: *(char*)to = *(char*)from; return to; - case 2: *(short*)to = *(short*)from; return to; - case 4: *(int*)to = *(int*)from; return to; -#if 1 /* including those doable with two moves? */ - case 3: *(short*)to = *(short*)from; - *((char*)to+2) = *((char*)from+2); return to; - case 5: *(int*)to = *(int*)from; - *((char*)to+4) = *((char*)from+4); return to; - case 6: *(int*)to = *(int*)from; - *((short*)to+2) = *((short*)from+2); return to; - case 8: *(int*)to = *(int*)from; - *((int*)to+1) = *((int*)from+1); return to; -#endif + case 1: + *(char *)to = *(char *)from; + return to; + case 2: + *(short *)to = *(short *)from; + return to; + case 4: + *(int *)to = *(int *)from; + return to; + + case 3: + *(short *)to = *(short *)from; + *((char *)to + 2) = *((char *)from + 2); + return to; + case 5: + *(int *)to = *(int *)from; + *((char *)to + 4) = *((char *)from + 4); + return to; + case 6: + *(int *)to = *(int *)from; + *((short *)to + 2) = *((short *)from + 2); + return to; + case 8: + *(int *)to = *(int *)from; + *((int *)to + 1) = *((int *)from + 1); + return to; } -#endif - esi = (long) from; - edi = (long) to; - if (n >= 5*4) { + + esi = (long)from; + edi = (long)to; + if (n >= 5 * 4) { /* large block: use rep prefix */ int ecx; - __asm__ __volatile__( - "rep ; movsl" - : "=&c" (ecx), "=&D" (edi), "=&S" (esi) - : "0" (n/4), "1" (edi),"2" (esi) - : "memory" + asm volatile("rep ; movsl" + : "=&c" (ecx), "=&D" (edi), "=&S" (esi) + : "0" (n / 4), "1" (edi), "2" (esi) + : "memory" ); } else { /* small block: don't clobber ecx + smaller code */ - if (n >= 4*4) __asm__ __volatile__("movsl" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - if (n >= 3*4) __asm__ __volatile__("movsl" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - if (n >= 2*4) __asm__ __volatile__("movsl" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - if (n >= 1*4) __asm__ __volatile__("movsl" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 4 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 3 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 2 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + if (n >= 1 * 4) + asm volatile("movsl" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); } switch (n % 4) { /* tail */ - case 0: return to; - case 1: __asm__ __volatile__("movsb" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - return to; - case 2: __asm__ __volatile__("movsw" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - return to; - default: __asm__ __volatile__("movsw\n\tmovsb" - :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); - return to; + case 0: + return to; + case 1: + asm volatile("movsb" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; + case 2: + asm volatile("movsw" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; + default: + asm volatile("movsw\n\tmovsb" + : "=&D"(edi), "=&S"(esi) + : "0"(edi), "1"(esi) + : "memory"); + return to; } } @@ -117,87 +152,86 @@ static __always_inline void * __constant_memcpy(void * to, const void * from, si * This CPU favours 3DNow strongly (eg AMD Athlon) */ -static inline void * __constant_memcpy3d(void * to, const void * from, size_t len) +static inline void *__constant_memcpy3d(void *to, const void *from, size_t len) { if (len < 512) return __constant_memcpy(to, from, len); return _mmx_memcpy(to, from, len); } -static __inline__ void *__memcpy3d(void *to, const void *from, size_t len) +static inline void *__memcpy3d(void *to, const void *from, size_t len) { if (len < 512) return __memcpy(to, from, len); return _mmx_memcpy(to, from, len); } -#define memcpy(t, f, n) \ -(__builtin_constant_p(n) ? \ - __constant_memcpy3d((t),(f),(n)) : \ - __memcpy3d((t),(f),(n))) +#define memcpy(t, f, n) \ + (__builtin_constant_p((n)) \ + ? __constant_memcpy3d((t), (f), (n)) \ + : __memcpy3d((t), (f), (n))) #else /* * No 3D Now! */ - -#define memcpy(t, f, n) \ -(__builtin_constant_p(n) ? \ - __constant_memcpy((t),(f),(n)) : \ - __memcpy((t),(f),(n))) + +#define memcpy(t, f, n) \ + (__builtin_constant_p((n)) \ + ? __constant_memcpy((t), (f), (n)) \ + : __memcpy((t), (f), (n))) #endif #define __HAVE_ARCH_MEMMOVE -void *memmove(void * dest,const void * src, size_t n); +void *memmove(void *dest, const void *src, size_t n); #define memcmp __builtin_memcmp #define __HAVE_ARCH_MEMCHR -extern void *memchr(const void * cs,int c,size_t count); +extern void *memchr(const void *cs, int c, size_t count); -static inline void * __memset_generic(void * s, char c,size_t count) +static inline void *__memset_generic(void *s, char c, size_t count) { -int d0, d1; -__asm__ __volatile__( - "rep\n\t" - "stosb" - : "=&c" (d0), "=&D" (d1) - :"a" (c),"1" (s),"0" (count) - :"memory"); -return s; + int d0, d1; + asm volatile("rep\n\t" + "stosb" + : "=&c" (d0), "=&D" (d1) + : "a" (c), "1" (s), "0" (count) + : "memory"); + return s; } /* we might want to write optimized versions of these later */ -#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count)) +#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count)) /* - * memset(x,0,y) is a reasonably common thing to do, so we want to fill + * memset(x, 0, y) is a reasonably common thing to do, so we want to fill * things 32 bits at a time even when we don't know the size of the * area at compile-time.. */ -static __always_inline void * __constant_c_memset(void * s, unsigned long c, size_t count) +static __always_inline +void *__constant_c_memset(void *s, unsigned long c, size_t count) { -int d0, d1; -__asm__ __volatile__( - "rep ; stosl\n\t" - "testb $2,%b3\n\t" - "je 1f\n\t" - "stosw\n" - "1:\ttestb $1,%b3\n\t" - "je 2f\n\t" - "stosb\n" - "2:" - :"=&c" (d0), "=&D" (d1) - :"a" (c), "q" (count), "0" (count/4), "1" ((long) s) - :"memory"); -return (s); + int d0, d1; + asm volatile("rep ; stosl\n\t" + "testb $2,%b3\n\t" + "je 1f\n\t" + "stosw\n" + "1:\ttestb $1,%b3\n\t" + "je 2f\n\t" + "stosb\n" + "2:" + : "=&c" (d0), "=&D" (d1) + : "a" (c), "q" (count), "0" (count/4), "1" ((long)s) + : "memory"); + return s; } /* Added by Gertjan van Wingerde to make minix and sysv module work */ #define __HAVE_ARCH_STRNLEN -extern size_t strnlen(const char * s, size_t count); +extern size_t strnlen(const char *s, size_t count); /* end of additional stuff */ #define __HAVE_ARCH_STRSTR @@ -207,66 +241,78 @@ extern char *strstr(const char *cs, const char *ct); * This looks horribly ugly, but the compiler can optimize it totally, * as we by now know that both pattern and count is constant.. */ -static __always_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count) +static __always_inline +void *__constant_c_and_count_memset(void *s, unsigned long pattern, + size_t count) { switch (count) { + case 0: + return s; + case 1: + *(unsigned char *)s = pattern & 0xff; + return s; + case 2: + *(unsigned short *)s = pattern & 0xffff; + return s; + case 3: + *(unsigned short *)s = pattern & 0xffff; + *((unsigned char *)s + 2) = pattern & 0xff; + return s; + case 4: + *(unsigned long *)s = pattern; + return s; + } + +#define COMMON(x) \ + asm volatile("rep ; stosl" \ + x \ + : "=&c" (d0), "=&D" (d1) \ + : "a" (pattern), "0" (count/4), "1" ((long)s) \ + : "memory") + + { + int d0, d1; + switch (count % 4) { case 0: + COMMON(""); return s; case 1: - *(unsigned char *)s = pattern & 0xff; + COMMON("\n\tstosb"); return s; case 2: - *(unsigned short *)s = pattern & 0xffff; + COMMON("\n\tstosw"); return s; - case 3: - *(unsigned short *)s = pattern & 0xffff; - *(2+(unsigned char *)s) = pattern & 0xff; + default: + COMMON("\n\tstosw\n\tstosb"); return s; - case 4: - *(unsigned long *)s = pattern; - return s; - } -#define COMMON(x) \ -__asm__ __volatile__( \ - "rep ; stosl" \ - x \ - : "=&c" (d0), "=&D" (d1) \ - : "a" (pattern),"0" (count/4),"1" ((long) s) \ - : "memory") -{ - int d0, d1; - switch (count % 4) { - case 0: COMMON(""); return s; - case 1: COMMON("\n\tstosb"); return s; - case 2: COMMON("\n\tstosw"); return s; - default: COMMON("\n\tstosw\n\tstosb"); return s; + } } -} - + #undef COMMON } -#define __constant_c_x_memset(s, c, count) \ -(__builtin_constant_p(count) ? \ - __constant_c_and_count_memset((s),(c),(count)) : \ - __constant_c_memset((s),(c),(count))) +#define __constant_c_x_memset(s, c, count) \ + (__builtin_constant_p(count) \ + ? __constant_c_and_count_memset((s), (c), (count)) \ + : __constant_c_memset((s), (c), (count))) -#define __memset(s, c, count) \ -(__builtin_constant_p(count) ? \ - __constant_count_memset((s),(c),(count)) : \ - __memset_generic((s),(c),(count))) +#define __memset(s, c, count) \ + (__builtin_constant_p(count) \ + ? __constant_count_memset((s), (c), (count)) \ + : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET -#define memset(s, c, count) \ -(__builtin_constant_p(c) ? \ - __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \ - __memset((s),(c),(count))) +#define memset(s, c, count) \ + (__builtin_constant_p(c) \ + ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ + (count)) \ + : __memset((s), (c), (count))) /* * find the first occurrence of byte 'c', or 1 past the area if none */ #define __HAVE_ARCH_MEMSCAN -extern void *memscan(void * addr, int c, size_t size); +extern void *memscan(void *addr, int c, size_t size); #endif /* __KERNEL__ */ -- cgit v1.1 From 83cd1daa1dc3400e5ca2255818641233ebb30680 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: eliminate dead code in x86_64 entry.S Remove the not longer used handlers for reserved vectors. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/entry_64.S | 4 ---- arch/x86/kernel/traps_64.c | 3 --- 2 files changed, 7 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df..1f1751d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1120,10 +1120,6 @@ ENTRY(coprocessor_segment_overrun) zeroentry do_coprocessor_segment_overrun END(coprocessor_segment_overrun) -ENTRY(reserved) - zeroentry do_reserved -END(reserved) - /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76e..ec6d3b2 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -71,7 +71,6 @@ asmlinkage void general_protection(void); asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); -asmlinkage void reserved(void); asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); @@ -702,12 +701,10 @@ DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGSEGV, "reserved", reserved) /* Runs on IST stack */ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) -- cgit v1.1 From 48e239572271cf79412d90753a721242a765f7d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 24 May 2008 17:24:34 +0200 Subject: x86: fixup the fallout of the bitops changes Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5015976..ee1d6d3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -255,7 +255,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - &ptep->pte); + (unsigned long *) &ptep->pte); if (ret) pte_update(vma->vm_mm, addr, ptep); -- cgit v1.1 From c9fea78dc9775981bed2da379568c8a8fddd1fb6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 19 May 2008 13:53:39 +0200 Subject: x86: make NUMAQ depend on PCI NUMAQ depends on the existence of PCI hardware, and requiring PCI makes this subarch simpler and avoids build failures if PCI is disabled. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bbafeac..0293728 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -270,7 +270,7 @@ config X86_VOYAGER config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" - depends on SMP && X86_32 + depends on SMP && X86_32 && PCI select NUMA help This option is used for getting Linux to run on a (IBM/Sequent) NUMA -- cgit v1.1 From 1ac97018169c5a13feaa90d9671f2d6ba2d9e86e Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 19 May 2008 14:10:14 +0200 Subject: x86: untangle pci dependencies make PCI-less subarches not build with PCI - instead of complicating the PCI dependencies. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0293728..78700f5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -258,7 +258,7 @@ config X86_ELAN config X86_VOYAGER bool "Voyager (NCR)" - depends on X86_32 && (SMP || BROKEN) + depends on X86_32 && (SMP || BROKEN) && !PCI help Voyager is an MCA-based 32-way capable SMP architecture proprietary to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. @@ -300,7 +300,7 @@ config X86_BIGSMP config X86_VISWS bool "SGI 320/540 (Visual Workstation)" - depends on X86_32 + depends on X86_32 && !PCI help The SGI Visual Workstation series is an IA32-based workstation based on SGI systems chips with some legacy PC hardware attached. @@ -344,7 +344,7 @@ config X86_RDC321X config X86_VSMP bool "Support for ScaleMP vSMP" select PARAVIRT - depends on X86_64 + depends on X86_64 && !PCI help Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is supposed to run on these EM64T-based machines. Only choose this option @@ -1477,8 +1477,7 @@ endmenu menu "Bus options (PCI etc.)" config PCI - bool "PCI support" if !X86_VISWS && !X86_VSMP - depends on !X86_VOYAGER + bool "PCI support" default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) help -- cgit v1.1 From 0da72a4aeb4482c64c1142a2e36b556d13374937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Wed, 30 Apr 2008 20:11:51 +0200 Subject: x86: fix sparse warning in mtrr/generic.c arch/x86/kernel/cpu/mtrr/generic.c:216:12: warning: symbol 'lo' shadows an earlier one Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5d241ce..0625d41 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -213,12 +213,12 @@ void __init get_mtrr_state(void) mtrr_state.enabled = (lo & 0xc00) >> 10; if (amd_special_default_mtrr()) { - unsigned lo, hi; + unsigned low, high; /* TOP_MEM2 */ - rdmsr(MSR_K8_TOP_MEM2, lo, hi); - tom2 = hi; + rdmsr(MSR_K8_TOP_MEM2, low, high); + tom2 = high; tom2 <<= 32; - tom2 |= lo; + tom2 |= low; tom2 &= 0xffffff8000000ULL; } if (mtrr_show) { -- cgit v1.1 From 0dbfafa5fcd4dd189e2adc7b6ed9e0405e846d79 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 23 Apr 2008 15:09:05 +0200 Subject: x86: move i386 memory setup code to e820_32.c The x86_64 code has centralized the memory setup code in e820_64.c. This patch copies that approach to i386: - early_param("mem", ...) parsing is moved from setup_32.c to e820_32.c. - setup_memory_map() and finish_e820_parsing() are factored out from setup_arch(), and declarations are added to e820_32.h. - print_memory_map() is made static and removed from e820_32.h. - user_defined_memmap is marked as __initdata. Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/e820_32.c | 60 ++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/setup_32.c | 50 ++------------------------------------ include/asm-x86/e820_32.h | 4 +++- 3 files changed, 63 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index ed733e7..31ea2bb 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -30,7 +30,6 @@ unsigned long pci_mem_start = 0x10000000; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif -extern int user_defined_memmap; static struct resource system_rom_resource = { .name = "System ROM", @@ -584,7 +583,7 @@ void __init e820_register_memory(void) pci_mem_start, gapstart, gapsize); } -void __init print_memory_map(char *who) +static void __init print_memory_map(char *who) { int i; @@ -692,6 +691,54 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type) return 0; } +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __init __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + +void __init setup_memory_map(void) +{ + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); +} + +static int __initdata user_defined_memmap; + +/* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ +static int __init parse_mem(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "nopentium") == 0) { + setup_clear_cpu_cap(X86_FEATURE_PSE); + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); + static int __init parse_memmap(char *arg) { if (!arg) @@ -762,6 +809,15 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, new_type); } } + +void __init finish_e820_parsing(void) +{ + if (user_defined_memmap) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } +} + void __init update_e820(void) { u8 nr_map; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2c5f8b2..b54c79c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -237,42 +237,6 @@ static inline void copy_edd(void) } #endif -int __initdata user_defined_memmap; - -/* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to <mem>, overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * <start> to <start>+<mem>, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] - */ -static int __init parse_mem(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "nopentium") == 0) { - setup_clear_cpu_cap(X86_FEATURE_PSE); - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(arg, &arg); - limit_regions(mem_size); - user_defined_memmap = 1; - } - return 0; -} -early_param("mem", parse_mem); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -725,12 +689,6 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - #ifdef CONFIG_NUMA /* * In the golden day, when everything among i386 and x86_64 will be @@ -786,8 +744,7 @@ void __init setup_arch(char **cmdline_p) #endif ARCH_SETUP - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); + setup_memory_map(); copy_edd(); @@ -807,10 +764,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (user_defined_memmap) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + finish_e820_parsing(); strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index a9f7c6e..e1f10c6 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -18,6 +18,9 @@ #ifndef __ASSEMBLY__ +extern void setup_memory_map(void); +extern void finish_e820_parsing(void); + extern struct e820map e820; extern void update_e820(void); @@ -32,7 +35,6 @@ extern void update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); -extern void print_memory_map(char *who); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); -- cgit v1.1 From 95ffa2438d0e9c48779f0106b1c0eb36165e759c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Tue, 29 Apr 2008 03:52:33 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout, v8 some BIOS like to use continus MTRR layout, and X driver can not add WB entries for graphical cards when 4g or more RAM installed. the patch will change MTRR to discrete. mtrr_chunk_size= could be used to have smaller continuous block to hold holes. default is 256m, could be set according to size of graphics card memory. mtrr_gran_size= could be used to send smallest mtrr block to avoid run out of MTRRs v2: fix -1 for UC checking v3: default to disable, and need use enable_mtrr_cleanup to enable this feature skip the var state change warning. remove next_basek in range_to_mtrr() v4: correct warning mask. v5: CONFIG_MTRR_SANITIZER v6: fix 1g, 2g, 512 aligment with extra hole v7: gran_sizek to prevent running out of MTRRs. v8: fix hole_basek caculation caused when removing next_basek gran_sizek using when basek is 0. need to apply [PATCH] x86: fix trimming e820 with MTRR holes. right after this one. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- Documentation/kernel-parameters.txt | 14 ++ arch/x86/Kconfig | 26 ++ arch/x86/kernel/cpu/mtrr/generic.c | 32 ++- arch/x86/kernel/cpu/mtrr/main.c | 467 +++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/mtrr/mtrr.h | 3 + 5 files changed, 528 insertions(+), 14 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e07c432..4442760 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -599,6 +599,20 @@ and is between 256 and 4096 characters. It is defined in the file See drivers/char/README.epca and Documentation/digiepca.txt. + disable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86] + The kernel tries to adjust MTRR layout from continuous + to discrete, to make X server driver able to add WB + entry later. This parameter enables/disables that. + + mtrr_chunk_size=nn[KMG] [X86] + used for mtrr cleanup. It is largest continous chunk + that could hold holes aka. UC entries. + + mtrr_gran_size=nn[KMG] [X86] + used for mtrr cleanup. It is granity of mtrr block. + Big value could prevent small alignment use up MTRRs. + disable_mtrr_trim [X86, Intel and AMD only] By default the kernel will trim any uncacheable memory out of your available memory pool based on diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae..97a1764 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1092,6 +1092,32 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. +config MTRR_SANITIZER + def_bool y + prompt "MTRR cleanup support" + depends on MTRR + help + Convert MTRR layout from continuous to discrete, so some X driver + could add WB entries. + + Say N here if you see bootup problems (boot crash, boot hang, + spontaneous reboots). + + Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size + could be used to send largest mtrr entry size for continuous block + to hold holes (aka. UC entries) + + If unsure, say Y. + +config MTRR_SANITIZER_ENABLE_DEFAULT + def_bool y + prompt "Enable MTRR cleanup by default" + depends on MTRR_SANITIZER + help + Enable mtrr cleanup by default + + If unsure, say Y. + config X86_PAT bool prompt "x86 PAT support" diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0625d41..5aae648 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = { static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; -static u64 tom2; +u64 mtrr_tom2; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." @@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) } } - if (tom2) { - if (start >= (1ULL<<32) && (end < tom2)) + if (mtrr_tom2) { + if (start >= (1ULL<<32) && (end < mtrr_tom2)) return MTRR_TYPE_WRBACK; } @@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } +/* fill the MSR pair relating to a var range */ +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) +{ + struct mtrr_var_range *vr; + + vr = mtrr_state.var_ranges; + + vr[index].base_lo = base_lo; + vr[index].base_hi = base_hi; + vr[index].mask_lo = mask_lo; + vr[index].mask_hi = mask_hi; +} + static void get_fixed_ranges(mtrr_type * frs) { @@ -216,10 +230,10 @@ void __init get_mtrr_state(void) unsigned low, high; /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); - tom2 = high; - tom2 <<= 32; - tom2 |= low; - tom2 &= 0xffffff8000000ULL; + mtrr_tom2 = high; + mtrr_tom2 <<= 32; + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff8000000ULL; } if (mtrr_show) { int high_width; @@ -251,9 +265,9 @@ void __init get_mtrr_state(void) else printk(KERN_INFO "MTRR %u disabled\n", i); } - if (tom2) { + if (mtrr_tom2) { printk(KERN_INFO "TOM2: %016llx aka %lldM\n", - tom2, tom2>>20); + mtrr_tom2, mtrr_tom2>>20); } } mtrr_state_set = 1; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6a1e278..8a6f68b 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -37,6 +37,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/sort.h> #include <asm/e820.h> #include <asm/mtrr.h> @@ -609,6 +610,452 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; +#ifdef CONFIG_MTRR_SANITIZER + +#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT +static int enable_mtrr_cleanup __initdata = 1; +#else +static int enable_mtrr_cleanup __initdata; +#endif + +#else + +static int enable_mtrr_cleanup __initdata = -1; + +#endif + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); + +#define RANGE_NUM 256 + +struct res_range { + unsigned long start; + unsigned long end; +}; + +static int __init add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end, int merge) +{ + int i; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < nr_range; i++) { + unsigned long final_start, final_end; + unsigned long common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + +addit: + /* need to add that */ + if (nr_range >= RANGE_NUM) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; + +} +static void __init subtract_range(struct res_range *range, unsigned long start, + unsigned long end) +{ + int i; + int j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int __init cmp_range(const void *x1, const void *x2) +{ + const struct res_range *r1 = x1; + const struct res_range *r2 = x2; + long start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +struct var_mtrr_state { + unsigned long range_startk, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; + unsigned address_bits; +}; + +static void __init set_var_mtrr( + unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + unsigned address_mask_high; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + address_mask_high = ((1u << (address_bits - 32u)) - 1u); + + base_hi = basek >> 22; + base_lo = basek << 10; + + if (sizek < 4*1024*1024) { + mask_hi = address_mask_high; + mask_lo = ~((sizek << 10) - 1); + } else { + mask_hi = address_mask_high & (~((sizek >> 22) - 1)); + mask_lo = 0; + } + + base_lo |= type; + mask_lo |= 0x800; + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static unsigned int __init range_to_mtrr(unsigned int reg, + unsigned long range_startk, unsigned long range_sizek, + unsigned char type, unsigned address_bits) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while (range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + /* Compute the maximum size I can make a range */ + if (range_startk) + max_align = ffs(range_startk) - 1; + else + max_align = 32; + align = fls(range_sizek) - 1; + if (align > max_align) + align = max_align; + + sizek = 1 << align; + printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n", + reg, range_startk >> 10, sizek >> 10, + (type == MTRR_TYPE_UNCACHABLE)?"UC": + ((type == MTRR_TYPE_WRBACK)?"WB":"Other") + ); + set_var_mtrr(reg++, range_startk, sizek, type, address_bits); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* align with gran size, prevent small block used up MTRRs */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return; + range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek); + + while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) { + range_sizek -= gran_sizek; + if (!range_sizek) + return; + } + state->range_startk = range_basek; + state->range_sizek = range_sizek; + + /* try to append some small hole */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + if ((range0_sizek == state->range_sizek) || + ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) { + printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + return; + } + + + range0_sizek -= chunk_sizek; + printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); + + range_basek = range0_basek + range0_sizek; + range_sizek = chunk_sizek; + if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) { + hole_sizek = range_sizek - (state->range_sizek - range0_sizek); + hole_basek = range_basek + range_sizek - hole_sizek; + } else + range_sizek = state->range_sizek - range0_sizek; + + printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + if (hole_sizek) { + printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits); + } +} + +static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) +{ + unsigned long basek, sizek; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range */ + if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs */ + if (state->range_sizek != 0) { + range_to_mtrr_with_hole(state, basek); + + state->range_startk = 0; + state->range_sizek = 0; + } + /* Allocate an msr */ + state->range_startk = basek; + state->range_sizek = sizek; +} + +/* mininum size of mtrr block that can take hole */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* granity of mtrr of block */ +static u64 mtrr_gran_size __initdata = (64ULL<<20); + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits) +{ + struct var_mtrr_state var_state; + int i; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.address_bits = address_bits; + var_state.chunk_sizek = mtrr_chunk_size >> 10; + var_state.gran_sizek = mtrr_gran_size >> 10; + + /* Write the range etc */ + for (i = 0; i < nr_range; i++) + set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + + /* Write the last range */ + range_to_mtrr_with_hole(&var_state, 0); + printk(KERN_INFO "DONE variable MTRRs\n"); + /* Clear out the extra MTRR's */ + while (var_state.reg < num_var_ranges) + set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits); +} + +static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_WRBACK) + continue; + nr_range = add_range(range, nr_range, base, base + size - 1, 1); + } + printk(KERN_INFO "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_UNCACHABLE) + continue; + if (!size) + continue; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + printk(KERN_INFO "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + printk(KERN_INFO "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + + return nr_range; +} + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long i, base, size, def, dummy; + mtrr_type type; + struct res_range range[RANGE_NUM]; + int nr_range; + unsigned long extra_remove_base, extra_remove_size; + + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* check if we got UC entries */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + extra_remove_size = 0; + if (mtrr_tom2) { + extra_remove_base = 1 << (32 - PAGE_SHIFT); + extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base; + } + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); + + /* convert ranges to var ranges state */ + x86_setup_var_mtrrs(range, nr_range, address_bits); + + return 1; + +} + static int disable_mtrr_trim; static int __init disable_mtrr_trim_setup(char *str) @@ -729,18 +1176,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ void __init mtrr_bp_init(void) { + u32 phys_addr; init_ifs(); + phys_addr = 32; + if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; + phys_addr = 36; /* This is an AMD specific MSR, but we assume(hope?) that Intel will implement it to when they extend the address bus of the Xeon. */ if (cpuid_eax(0x80000000) >= 0x80000008) { - u32 phys_addr; phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -758,6 +1208,7 @@ void __init mtrr_bp_init(void) don't support PAE */ size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; + phys_addr = 32; } } else { switch (boot_cpu_data.x86_vendor) { @@ -791,8 +1242,13 @@ void __init mtrr_bp_init(void) if (mtrr_if) { set_num_var_ranges(); init_table(); - if (use_intel()) + if (use_intel()) { get_mtrr_state(); + + if (mtrr_cleanup(phys_addr)) + mtrr_if->set_all(); + + } } } @@ -829,9 +1285,10 @@ static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; - if (use_intel()) - mtrr_state_warn(); - else { + if (use_intel()) { + if (enable_mtrr_cleanup < 1) + mtrr_state_warn(); + } else { /* The CPUs haven't MTRR and seem to not support SMP. They have * specific drivers, we use a tricky method to support * suspend/resume for them. diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2cc77eb..2dc4ec6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); @@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if; #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) extern unsigned int num_var_ranges; +extern u64 mtrr_tom2; void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); -- cgit v1.1 From 42651f15824d003e8357693ab72c4dbb3e280836 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Tue, 29 Apr 2008 01:59:49 -0700 Subject: x86: fix trimming e820 with MTRR holes. converting MTRR layout from continous to discrete, some time could run out of MTRRs. So add gran_sizek to prevent that by dumpping small RAM piece less than gran_sizek. previous trimming only can handle highest_pfn from mtrr to end_pfn from e820. when have more than 4g RAM installed, there will be holes below 4g. so need to check ram below 4g is coverred well. need to be applied after [PATCH] x86: mtrr cleanup for converting continuous to discrete layout v7 Signed-off-by: Yinghai Lu <yinghai.lu@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/mtrr/main.c | 101 +++++++++++++++++++++++++++++++++------- arch/x86/kernel/e820_32.c | 7 ++- arch/x86/kernel/e820_64.c | 6 ++- include/asm-x86/e820_32.h | 2 +- include/asm-x86/e820_64.h | 2 +- 5 files changed, 97 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8a6f68b..9ab5c16 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1095,6 +1095,17 @@ int __init amd_special_default_mtrr(void) return 0; } +static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + return update_memory_range(trim_start, trim_size, E820_RAM, + E820_RESERVED); +} /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -1110,8 +1121,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - u64 trim_start, trim_size; + struct res_range range[RANGE_NUM]; + int nr_range; + u64 total_real_trim_size; + int changed; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: @@ -1123,9 +1139,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) if (def != MTRR_TYPE_UNCACHABLE) return 0; - if (amd_special_default_mtrr()) - return 0; - /* Find highest cached pfn */ for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1145,26 +1158,80 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - if (highest_pfn < end_pfn) { + /* check entries number */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* no entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; + if (highest_pfn < range[nr_range].end + 1) + highest_pfn = range[nr_range].end + 1; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + changed = 0; + total_real_trim_size = 0; + + /* check the top at first */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) { + total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); + } + + if (total_real_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %luMB of RAM.\n", - (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT)); + " all of memory, losing %lluMB of RAM.\n", + total_real_trim_size >> 20); WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); - trim_start = highest_pfn; - trim_start <<= PAGE_SHIFT; - trim_size = end_pfn; - trim_size <<= PAGE_SHIFT; - trim_size -= trim_start; - update_memory_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + printk(KERN_INFO "update e820 for mtrr -- end_pfn\n"); update_e820(); - return 1; + changed = 1; } - return 0; + total_real_trim_size = 0; + if (range[0].start) + total_real_trim_size += real_trim_memory(0, range[0].start); + + for (i = 0; i < nr_range - 1; i--) { + if (range[i].end + 1 < range[i+1].start) + total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); + } + + if (total_real_trim_size) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %lluMB of RAM.\n", + total_real_trim_size >> 20); + + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr -- holes\n"); + update_e820(); + changed = 1; + } + + return changed; } /** diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 31ea2bb..857f706 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -783,10 +783,11 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); -void __init update_memory_range(u64 start, u64 size, unsigned old_type, +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { int i; + u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -798,6 +799,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, /* totally covered? */ if (ei->addr >= start && ei->size <= size) { ei->type = new_type; + real_updated_size += ei->size; continue; } /* partially covered */ @@ -807,7 +809,10 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, continue; add_memory_region(final_start, final_end - final_start, new_type); + real_updated_size += final_end - final_start; } + + return real_updated_size; } void __init finish_e820_parsing(void) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 124480c..848b2cd 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -829,10 +829,11 @@ void __init finish_e820_parsing(void) } } -void __init update_memory_range(u64 start, u64 size, unsigned old_type, +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { int i; + u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -844,6 +845,7 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, /* totally covered? */ if (ei->addr >= start && ei->size <= size) { ei->type = new_type; + real_updated_size += ei->size; continue; } /* partially covered */ @@ -853,7 +855,9 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type, continue; add_memory_region(final_start, final_end - final_start, new_type); + real_updated_size += final_end - final_start; } + return real_updated_size; } void __init update_e820(void) diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index e1f10c6..af0711b 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -31,7 +31,7 @@ extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void add_memory_region(unsigned long long start, unsigned long long size, int type); -extern void update_memory_range(u64 start, u64 size, unsigned old_type, +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 71c4d68..6ae3e28 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -21,7 +21,7 @@ extern unsigned long find_e820_area_size(unsigned long start, unsigned long align); extern void add_memory_region(unsigned long start, unsigned long size, int type); -extern void update_memory_range(u64 start, u64 size, unsigned old_type, +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void setup_memory_region(void); extern void contig_e820_setup(void); -- cgit v1.1 From 8a374026c265476b1acfc3c186a66d59ebdb2cda Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Tue, 29 Apr 2008 20:25:16 -0700 Subject: x86: fix trimming e820 with MTRR holes. - fix v2: process hole then end_pfn fix update_memory_range with whole cover comparing Signed-off-by: Yinghai Lu <yinghai.lu@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/mtrr/main.c | 44 ++++++++++++++--------------------------- arch/x86/kernel/e820_32.c | 3 ++- arch/x86/kernel/e820_64.c | 3 ++- 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 9ab5c16..5a2a4c1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1098,11 +1098,12 @@ int __init amd_special_default_mtrr(void) static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; - trim_start = start_pfn; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; + return update_memory_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } @@ -1124,7 +1125,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) struct res_range range[RANGE_NUM]; int nr_range; u64 total_real_trim_size; - int changed; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1189,49 +1189,35 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - changed = 0; - total_real_trim_size = 0; - - /* check the top at first */ - i = nr_range - 1; - if (range[i].end + 1 < end_pfn) { - total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); - } - - if (total_real_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_real_trim_size >> 20); - - WARN_ON(1); - - printk(KERN_INFO "update e820 for mtrr -- end_pfn\n"); - update_e820(); - changed = 1; - } - total_real_trim_size = 0; + /* check the head */ if (range[0].start) total_real_trim_size += real_trim_memory(0, range[0].start); - - for (i = 0; i < nr_range - 1; i--) { + /* check the holes */ + for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } + /* check the top */ + i = nr_range - 1; + if (range[i].end + 1 < end_pfn) + total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_real_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" " all of memory, losing %lluMB of RAM.\n", total_real_trim_size >> 20); - WARN_ON(1); + if (enable_mtrr_cleanup < 1) + WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr -- holes\n"); + printk(KERN_INFO "update e820 for mtrr\n"); update_e820(); - changed = 1; + + return 1; } - return changed; + return 0; } /** diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 857f706..751d517 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -797,7 +797,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, if (ei->type != old_type) continue; /* totally covered? */ - if (ei->addr >= start && ei->size <= size) { + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { ei->type = new_type; real_updated_size += ei->size; continue; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 848b2cd..c45b4de 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -843,7 +843,8 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, if (ei->type != old_type) continue; /* totally covered? */ - if (ei->addr >= start && ei->size <= size) { + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { ei->type = new_type; real_updated_size += ei->size; continue; -- cgit v1.1 From f5098d62c1d1cede8ff23d01bbf50a421f110562 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Tue, 29 Apr 2008 20:25:58 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout v8 - fix v9: address format change requests by Ingo more case handling in range_to_var_with_hole Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 9 +-- arch/x86/kernel/cpu/mtrr/main.c | 164 ++++++++++++++++++++++------------------ 2 files changed, 94 insertions(+), 79 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 97a1764..f417fe3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1110,13 +1110,12 @@ config MTRR_SANITIZER If unsure, say Y. config MTRR_SANITIZER_ENABLE_DEFAULT - def_bool y - prompt "Enable MTRR cleanup by default" + int "MTRR cleanup enable value (0-1)" + range 0 1 + default "0" depends on MTRR_SANITIZER help - Enable mtrr cleanup by default - - If unsure, say Y. + Enable mtrr cleanup default value config X86_PAT bool diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5a2a4c1..79597d3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -611,17 +611,9 @@ static struct sysdev_driver mtrr_sysdev_driver = { }; #ifdef CONFIG_MTRR_SANITIZER - -#ifdef CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT -static int enable_mtrr_cleanup __initdata = 1; -#else -static int enable_mtrr_cleanup __initdata; -#endif - +static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; #else - static int enable_mtrr_cleanup __initdata = -1; - #endif static int __init disable_mtrr_cleanup_setup(char *str) @@ -640,6 +632,7 @@ static int __init enable_mtrr_cleanup_setup(char *str) } early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +/* should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 struct res_range { @@ -647,13 +640,27 @@ struct res_range { unsigned long end; }; -static int __init add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end, int merge) +static int __init +add_range(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) { - int i; + /* out of slots */ + if (nr_range >= RANGE_NUM) + return nr_range; - if (!merge) - goto addit; + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +static int __init +add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, + unsigned long end) +{ + int i; /* try to merge it with old one */ for (i = 0; i < nr_range; i++) { @@ -676,24 +683,14 @@ static int __init add_range(struct res_range *range, int nr_range, unsigned long return nr_range; } -addit: /* need to add that */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; - + return add_range(range, nr_range, start, end); } -static void __init subtract_range(struct res_range *range, unsigned long start, - unsigned long end) + +static void __init +subtract_range(struct res_range *range, unsigned long start, unsigned long end) { - int i; - int j; + int i, j; for (j = 0; j < RANGE_NUM; j++) { if (!range[j].end) @@ -747,46 +744,47 @@ static int __init cmp_range(const void *x1, const void *x2) } struct var_mtrr_state { - unsigned long range_startk, range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; - unsigned address_bits; + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; + unsigned int address_bits; }; -static void __init set_var_mtrr( - unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned address_bits) +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; - unsigned address_mask_high; + u64 base, mask; if (!sizek) { fill_mtrr_var_range(reg, 0, 0, 0, 0); return; } - address_mask_high = ((1u << (address_bits - 32u)) - 1u); + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); - base_hi = basek >> 22; - base_lo = basek << 10; + base = ((u64)basek) << 10; - if (sizek < 4*1024*1024) { - mask_hi = address_mask_high; - mask_lo = ~((sizek << 10) - 1); - } else { - mask_hi = address_mask_high & (~((sizek >> 22) - 1)); - mask_lo = 0; - } + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; - base_lo |= type; - mask_lo |= 0x800; fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); } -static unsigned int __init range_to_mtrr(unsigned int reg, - unsigned long range_startk, unsigned long range_sizek, - unsigned char type, unsigned address_bits) +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type, + unsigned address_bits) { if (!range_sizek || (reg >= num_var_ranges)) return reg; @@ -794,6 +792,7 @@ static unsigned int __init range_to_mtrr(unsigned int reg, while (range_sizek) { unsigned long max_align, align; unsigned long sizek; + /* Compute the maximum size I can make a range */ if (range_startk) max_align = ffs(range_startk) - 1; @@ -818,7 +817,8 @@ static unsigned int __init range_to_mtrr(unsigned int reg, return reg; } -static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +static void __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) { unsigned long hole_basek, hole_sizek; unsigned long range0_basek, range0_sizek; @@ -848,23 +848,31 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - if ((range0_sizek == state->range_sizek) || - ((range0_basek + range0_sizek - chunk_sizek > basek) && basek)) { + if (range0_sizek == state->range_sizek) { printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); return; + } else if (basek) { + while (range0_basek + range0_sizek - chunk_sizek > basek) { + range0_sizek -= chunk_sizek; + if (!range0_sizek) + break; + } } - range0_sizek -= chunk_sizek; + if (range0_sizek > chunk_sizek) + range0_sizek -= chunk_sizek; printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); range_basek = range0_basek + range0_sizek; range_sizek = chunk_sizek; - if (range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) { + + if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) && + (range_basek + range_sizek <= basek)) { hole_sizek = range_sizek - (state->range_sizek - range0_sizek); hole_basek = range_basek + range_sizek - hole_sizek; } else @@ -880,7 +888,9 @@ static void __init range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigne } } -static void __init set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) { unsigned long basek, sizek; @@ -921,7 +931,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); /* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata = (64ULL<<20); +static u64 mtrr_gran_size __initdata = (1ULL<<20); static int __init parse_mtrr_gran_size_opt(char *p) { @@ -932,17 +942,19 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, unsigned address_bits) +static void __init +x86_setup_var_mtrrs(struct res_range *range, int nr_range, + unsigned address_bits) { struct var_mtrr_state var_state; int i; - var_state.range_startk = 0; - var_state.range_sizek = 0; - var_state.reg = 0; - var_state.address_bits = address_bits; - var_state.chunk_sizek = mtrr_chunk_size >> 10; - var_state.gran_sizek = mtrr_gran_size >> 10; + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.address_bits = address_bits; + var_state.chunk_sizek = mtrr_chunk_size >> 10; + var_state.gran_sizek = mtrr_gran_size >> 10; /* Write the range etc */ for (i = 0; i < nr_range; i++) @@ -952,11 +964,16 @@ static void __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, un range_to_mtrr_with_hole(&var_state, 0); printk(KERN_INFO "DONE variable MTRRs\n"); /* Clear out the extra MTRR's */ - while (var_state.reg < num_var_ranges) - set_var_mtrr(var_state.reg++, 0, 0, 0, var_state.address_bits); + while (var_state.reg < num_var_ranges) { + set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits); + var_state.reg++; + } } -static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) { unsigned long i, base, size; mtrr_type type; @@ -965,7 +982,7 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, mtrr_if->get(i, &base, &size, &type); if (type != MTRR_TYPE_WRBACK) continue; - nr_range = add_range(range, nr_range, base, base + size - 1, 1); + nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); } printk(KERN_INFO "After WB checking\n"); for (i = 0; i < nr_range; i++) @@ -1005,11 +1022,11 @@ static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, static int __init mtrr_cleanup(unsigned address_bits) { + unsigned long extra_remove_base, extra_remove_size; unsigned long i, base, size, def, dummy; - mtrr_type type; struct res_range range[RANGE_NUM]; + mtrr_type type; int nr_range; - unsigned long extra_remove_base, extra_remove_size; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1053,7 +1070,6 @@ static int __init mtrr_cleanup(unsigned address_bits) x86_setup_var_mtrrs(range, nr_range, address_bits); return 1; - } static int disable_mtrr_trim; -- cgit v1.1 From 12031a624af7816ec7660b82be648aa3703b4ebe Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Fri, 2 May 2008 02:40:22 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete - auto detect v4 Loop through mtrr chunk_size and gran_size from 1M to 2G to find out the optimal value so user does not need to add mtrr_chunk_size and mtrr_gran_size to the kernel command line. If optimal value is not found, print out all list to help select less optimal value. Add mtrr_spare_reg_nr= so user could set 2 instead of 1, if the card need more entries. v2: find the one with more spare entries v3: fix hole_basek offset v4: tight the compare between range and range_new loop stop with 4g Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Gabriel C <nix.or.die@googlemail.com> Cc: Mika Fischer <mika.fischer@zoopnet.de> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/kernel-parameters.txt | 13 +- arch/x86/Kconfig | 9 + arch/x86/kernel/cpu/mtrr/main.c | 610 +++++++++++++++++++++++++++--------- 3 files changed, 484 insertions(+), 148 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4442760..2000977 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -610,8 +610,17 @@ and is between 256 and 4096 characters. It is defined in the file that could hold holes aka. UC entries. mtrr_gran_size=nn[KMG] [X86] - used for mtrr cleanup. It is granity of mtrr block. - Big value could prevent small alignment use up MTRRs. + Used for mtrr cleanup. It is granularity of mtrr block. + Default is 1. + Large value could prevent small alignment from + using up MTRRs. + + mtrr_spare_reg_nr=n [X86] + Format: <integer> + Range: 0,7 : spare reg number + Default : 1 + Used for mtrr cleanup. It is spare mtrr entries number. + Set to 2 or more if your graphical card needs more. disable_mtrr_trim [X86, Intel and AMD only] By default the kernel will trim any uncacheable diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f417fe3..9e97615 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1117,6 +1117,15 @@ config MTRR_SANITIZER_ENABLE_DEFAULT help Enable mtrr cleanup default value +config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT + int "MTRR cleanup spare reg num (0-7)" + range 0 7 + default "1" + depends on MTRR_SANITIZER + help + mtrr cleanup spare entries default, it can be changed via + mtrr_spare_reg_nr= + config X86_PAT bool prompt "x86 PAT support" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79597d3..e6c162c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -610,28 +610,6 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; -#ifdef CONFIG_MTRR_SANITIZER -static int enable_mtrr_cleanup __initdata = CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; -#else -static int enable_mtrr_cleanup __initdata = -1; -#endif - -static int __init disable_mtrr_cleanup_setup(char *str) -{ - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 0; - return 0; -} -early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); - -static int __init enable_mtrr_cleanup_setup(char *str) -{ - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 1; - return 0; -} -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); - /* should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 @@ -702,13 +680,15 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) continue; } - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { range[j].start = end + 1; continue; } - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { range[j].end = start - 1; continue; } @@ -743,18 +723,123 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __init +x86_get_mtrr_mem_range(struct res_range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long i, base, size; + mtrr_type type; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, nr_range, base, + base + size - 1); + } + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* take out UC ranges */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + subtract_range(range, base, base + size - 1); + } + if (extra_remove_size) + subtract_range(range, extra_remove_base, + extra_remove_base + extra_remove_size - 1); + + /* get new range num */ + nr_range = 0; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + nr_range++; + } + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + + /* clear those is not used */ + for (i = nr_range; i < RANGE_NUM; i++) + memset(&range[i], 0, sizeof(range[i])); + + return nr_range; +} + +static struct res_range __initdata range[RANGE_NUM]; + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +{ + unsigned long sum; + int i; + + sum = 0; + for (i = 0; i < nr_range; i++) + sum += range[i].end + 1 - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + if (enable_mtrr_cleanup != -1) + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); + struct var_mtrr_state { unsigned long range_startk; unsigned long range_sizek; unsigned long chunk_sizek; unsigned long gran_sizek; unsigned int reg; - unsigned int address_bits; }; static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -781,10 +866,34 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); } +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init +set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + static unsigned int __init range_to_mtrr(unsigned int reg, unsigned long range_startk, - unsigned long range_sizek, unsigned char type, - unsigned address_bits) + unsigned long range_sizek, unsigned char type) { if (!range_sizek || (reg >= num_var_ranges)) return reg; @@ -803,12 +912,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - printk(KERN_INFO "Setting variable MTRR %d, base: %ldMB, range: %ldMB, type %s\n", + printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, " + "range: %ldMB, type %s\n", reg, range_startk >> 10, sizek >> 10, (type == MTRR_TYPE_UNCACHABLE)?"UC": ((type == MTRR_TYPE_WRBACK)?"WB":"Other") ); - set_var_mtrr(reg++, range_startk, sizek, type, address_bits); + save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; if (reg >= num_var_ranges) @@ -817,10 +927,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, return reg; } -static void __init -range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) { unsigned long hole_basek, hole_sizek; + unsigned long second_basek, second_sizek; unsigned long range0_basek, range0_sizek; unsigned long range_basek, range_sizek; unsigned long chunk_sizek; @@ -828,64 +940,95 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek) hole_basek = 0; hole_sizek = 0; + second_basek = 0; + second_sizek = 0; chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; /* align with gran size, prevent small block used up MTRRs */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) - return; - range_sizek = ALIGN(state->range_sizek - (range_basek - state->range_startk), gran_sizek); + return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); - while (range_basek + range_sizek > (state->range_startk + state->range_sizek)) { + while (range_sizek > state->range_sizek) { range_sizek -= gran_sizek; if (!range_sizek) - return; + return 0; } - state->range_startk = range_basek; state->range_sizek = range_sizek; /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); if (range0_sizek == state->range_sizek) { - printk(KERN_INFO "rangeX: %016lx - %016lx\n", range0_basek<<10, (range0_basek + state->range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - state->range_sizek, MTRR_TYPE_WRBACK, state->address_bits); - return; - } else if (basek) { - while (range0_basek + range0_sizek - chunk_sizek > basek) { + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + range0_sizek -= chunk_sizek; + if (range0_sizek && sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { range0_sizek -= chunk_sizek; if (!range0_sizek) break; } } + if (range0_sizek) { + printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); - if (range0_sizek > chunk_sizek) - range0_sizek -= chunk_sizek; - printk(KERN_INFO "range0: %016lx - %016lx\n", range0_basek<<10, (range0_basek + range0_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - range0_sizek, MTRR_TYPE_WRBACK, state->address_bits); + } range_basek = range0_basek + range0_sizek; range_sizek = chunk_sizek; - if ((range_sizek - (state->range_sizek - range0_sizek) < (chunk_sizek >> 1)) && - (range_basek + range_sizek <= basek)) { - hole_sizek = range_sizek - (state->range_sizek - range0_sizek); - hole_basek = range_basek + range_sizek - hole_sizek; - } else + if (range_basek + range_sizek > basek && + range_basek + range_sizek <= (basek + sizek)) { + /* one hole */ + second_basek = basek; + second_sizek = range_basek + range_sizek - basek; + } + + /* if last piece, only could one hole near end */ + if ((second_basek || !basek) && + range_sizek - (state->range_sizek - range0_sizek) - second_sizek < + (chunk_sizek >> 1)) { + /* + * one hole in middle (second_sizek is 0) or at end + * (second_sizek is 0 ) + */ + hole_sizek = range_sizek - (state->range_sizek - range0_sizek) + - second_sizek; + hole_basek = range_basek + range_sizek - hole_sizek + - second_sizek; + } else { + /* fallback for big hole, or several holes */ range_sizek = state->range_sizek - range0_sizek; + second_basek = 0; + second_sizek = 0; + } - printk(KERN_INFO "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, - range_sizek, MTRR_TYPE_WRBACK, state->address_bits); + printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, + MTRR_TYPE_WRBACK); if (hole_sizek) { - printk(KERN_INFO "hole: %016lx - %016lx\n", hole_basek<<10, (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, - hole_sizek, MTRR_TYPE_UNCACHABLE, state->address_bits); + printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, + MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; } static void __init @@ -893,6 +1036,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, unsigned long size_pfn) { unsigned long basek, sizek; + unsigned long second_sizek = 0; if (state->reg >= num_var_ranges) return; @@ -901,21 +1045,19 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, sizek = size_pfn << (PAGE_SHIFT - 10); /* See if I can merge with the last range */ - if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } /* Write the range mtrrs */ - if (state->range_sizek != 0) { - range_to_mtrr_with_hole(state, basek); + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - state->range_startk = 0; - state->range_sizek = 0; - } /* Allocate an msr */ - state->range_startk = basek; - state->range_sizek = sizek; + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; } /* mininum size of mtrr block that can take hole */ @@ -931,7 +1073,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); /* granity of mtrr of block */ -static u64 mtrr_gran_size __initdata = (1ULL<<20); +static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) { @@ -942,91 +1084,84 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static void __init +static int nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init x86_setup_var_mtrrs(struct res_range *range, int nr_range, - unsigned address_bits) + u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; int i; + int num_reg; var_state.range_startk = 0; var_state.range_sizek = 0; var_state.reg = 0; - var_state.address_bits = address_bits; - var_state.chunk_sizek = mtrr_chunk_size >> 10; - var_state.gran_sizek = mtrr_gran_size >> 10; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); /* Write the range etc */ for (i = 0; i < nr_range; i++) - set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start + 1); /* Write the last range */ - range_to_mtrr_with_hole(&var_state, 0); - printk(KERN_INFO "DONE variable MTRRs\n"); + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + printk(KERN_DEBUG "DONE variable MTRRs\n"); + + num_reg = var_state.reg; /* Clear out the extra MTRR's */ while (var_state.reg < num_var_ranges) { - set_var_mtrr(var_state.reg, 0, 0, 0, var_state.address_bits); + save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; } -} - -static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, - unsigned long extra_remove_base, - unsigned long extra_remove_size) -{ - unsigned long i, base, size; - mtrr_type type; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - if (type != MTRR_TYPE_WRBACK) - continue; - nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); - } - printk(KERN_INFO "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); - /* take out UC ranges */ - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - if (type != MTRR_TYPE_UNCACHABLE) - continue; - if (!size) - continue; - subtract_range(range, base, base + size - 1); - } - if (extra_remove_size) - subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); + return num_reg; +} - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } - printk(KERN_INFO "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; - /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - printk(KERN_INFO "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_INFO "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); +/* + * gran_size: 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 4G + * so we need (2+13)*6 + */ +#define NUM_RESULT 90 +#define PSHIFT (PAGE_SHIFT - 10) - return nr_range; -} +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static struct res_range __initdata range_new[RANGE_NUM]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static int __init mtrr_cleanup(unsigned address_bits) { unsigned long extra_remove_base, extra_remove_size; unsigned long i, base, size, def, dummy; - struct res_range range[RANGE_NUM]; mtrr_type type; - int nr_range; + int nr_range, nr_range_new; + u64 chunk_size, gran_size; + unsigned long range_sums, range_sums_new; + int index_good; + int num_reg_good; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1038,10 +1173,20 @@ static int __init mtrr_cleanup(unsigned address_bits) if (def != MTRR_TYPE_UNCACHABLE) return 0; + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + /* check entries number */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); + type = range_state[i].type; + size = range_state[i].size_pfn; if (type >= MTRR_NUM_TYPES) continue; if (!size) @@ -1062,15 +1207,172 @@ static int __init mtrr_cleanup(unsigned address_bits) extra_remove_size = 0; if (mtrr_tom2) { extra_remove_base = 1 << (32 - PAGE_SHIFT); - extra_remove_size = (mtrr_tom2>>PAGE_SHIFT) - extra_remove_base; + extra_remove_size = + (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; + } + nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, + extra_remove_size); + range_sums = sum_ranges(range, nr_range); + printk(KERN_INFO "total RAM coverred: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + int num_reg; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, + mtrr_gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, + extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + i = 0; + result[i].chunk_sizek = mtrr_chunk_size >> 10; + result[i].gran_sizek = mtrr_gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + printk(KERN_INFO " %sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?" BAD ":"", result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + result[i].num_reg, result[i].bad?"-":"", + result[i].lose_cover_sizek >> 10); + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + return 1; + } + printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " + "will find optimal one\n"); + memset(result, 0, sizeof(result[0])); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { + for (chunk_size = gran_size; chunk_size < (1ULL<<33); + chunk_size <<= 1) { + int num_reg; + + printk(KERN_INFO + "\ngran_size: %lldM chunk_size_size: %lldM\n", + gran_size >> 20, chunk_size >> 20); + if (i >= NUM_RESULT) + continue; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; + + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; + } + i++; + } } - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); - /* convert ranges to var ranges state */ - x86_setup_var_mtrrs(range, nr_range, address_bits); + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) { + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + result[i].num_reg, result[i].bad?"-":"", + result[i].lose_cover_sizek >> 10); + } - return 1; + /* try to find the optimal index */ + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) { + if (!min_loss_pfn[i]) { + num_reg_good = i; + break; + } + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + if (index_good != -1) { + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + i = index_good; + printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", + result[i].gran_sizek >> 10, + result[i].chunk_sizek >> 10); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %ldM \n", + result[i].num_reg, + result[i].lose_cover_sizek >> 10); + /* convert ranges to var ranges state */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + return 1; + } + + printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); + printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; } +#else +static int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int __initdata changed_by_mtrr_cleanup; static int disable_mtrr_trim; @@ -1111,7 +1413,8 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +static u64 __init real_trim_memory(unsigned long start_pfn, + unsigned long limit_pfn) { u64 trim_start, trim_size; trim_start = start_pfn; @@ -1138,9 +1441,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - struct res_range range[RANGE_NUM]; int nr_range; - u64 total_real_trim_size; + u64 total_trim_size; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1155,11 +1457,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* Find highest cached pfn */ + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; if (highest_pfn < base + size) highest_pfn = base + size; } @@ -1177,9 +1490,10 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* check entries number */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); + type = range_state[i].type; if (type >= MTRR_NUM_TYPES) continue; + size = range_state[i].size_pfn; if (!size) type = MTRR_NUM_TYPES; num[type]++; @@ -1205,26 +1519,28 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); - total_real_trim_size = 0; + total_trim_size = 0; /* check the head */ if (range[0].start) - total_real_trim_size += real_trim_memory(0, range[0].start); + total_trim_size += real_trim_memory(0, range[0].start); /* check the holes */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) - total_real_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); + total_trim_size += real_trim_memory(range[i].end + 1, + range[i+1].start); } /* check the top */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) - total_real_trim_size += real_trim_memory(range[i].end + 1, end_pfn); + total_trim_size += real_trim_memory(range[i].end + 1, + end_pfn); - if (total_real_trim_size) { + if (total_trim_size) { printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" " all of memory, losing %lluMB of RAM.\n", - total_real_trim_size >> 20); + total_trim_size >> 20); - if (enable_mtrr_cleanup < 1) + if (!changed_by_mtrr_cleanup) WARN_ON(1); printk(KERN_INFO "update e820 for mtrr\n"); @@ -1314,8 +1630,10 @@ void __init mtrr_bp_init(void) if (use_intel()) { get_mtrr_state(); - if (mtrr_cleanup(phys_addr)) + if (mtrr_cleanup(phys_addr)) { + changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); + } } } @@ -1355,7 +1673,7 @@ static int __init mtrr_init_finialize(void) if (!mtrr_if) return 0; if (use_intel()) { - if (enable_mtrr_cleanup < 1) + if (!changed_by_mtrr_cleanup) mtrr_state_warn(); } else { /* The CPUs haven't MTRR and seem to not support SMP. They have -- cgit v1.1 From 833e78bfeeef628f0201349a0a05a54f48f07884 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Mon, 5 May 2008 15:57:38 -0700 Subject: x86: process fam 10h like k8 with fixed mtrr setting otherwise fixed MTRR for family 10h may not be changed. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 5aae648..a83f5cd7 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -342,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) if (lo != msrwords[0] || hi != msrwords[1]) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 15 && + (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) && ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); -- cgit v1.1 From b79cd8f1268bab57ff85b19d131f7f23deab2dee Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 11 May 2008 00:30:15 -0700 Subject: x86: make e820.c to have common functions remove the duplicated copy of these functions. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 475 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 411 +-------------------------------------- arch/x86/kernel/e820_64.c | 444 ------------------------------------------ arch/x86/kernel/setup_32.c | 2 +- include/asm-x86/e820.h | 14 ++ include/asm-x86/e820_32.h | 12 -- include/asm-x86/e820_64.h | 12 -- 8 files changed, 496 insertions(+), 876 deletions(-) create mode 100644 arch/x86/kernel/e820.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..5369a4e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += bootflag.o e820_$(BITS).o +obj-y += bootflag.o e820_$(BITS).o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 0000000..2cb686f --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,475 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/string.h> +#include <linux/kexec.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/pfn.h> + +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/setup.h> + +struct e820map e820; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range <start,end> is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(u64 start, u64 size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)\n"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)\n"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following + (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing = 1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + + /* loop through change-points, determining affect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= E820MAX) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +int __init copy_e820_map(struct e820entry *biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + add_memory_region(start, size, type); + } while (biosmap++, --nr_map); + return 0; +} + +u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? */ + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + real_updated_size += final_end - final_start; + } + return real_updated_size; +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + +#ifdef CONFIG_X86_64 + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); + } +#endif + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 751d517..dfe2575 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -16,21 +16,6 @@ #include <asm/e820.h> #include <asm/setup.h> -struct e820map e820; -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -254,223 +239,6 @@ void __init e820_mark_nosave_regions(void) } #endif -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} /* add_memory_region */ - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) { - return -1; - } - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { - return -1; - } - - /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i=0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; i<overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - - return 0; -} - /* * Find the highest page frame number we have available */ @@ -535,86 +303,12 @@ void __init register_bootmem_low_pages(unsigned long max_low_pfn) } } -void __init e820_register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; - print_memory_map("limit_regions start"); + e820_print_map("limit_regions start"); for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -633,62 +327,10 @@ void __init limit_regions(unsigned long long size) e820.nr_map = i + 1; e820.map[i].size -= current_addr - size; } - print_memory_map("limit_regions endfor"); + e820_print_map("limit_regions endfor"); return; } - print_memory_map("limit_regions endfunc"); -} - -/* - * This function checks if any part of the range <start,end> is mapped - * with type. - */ -int -e820_any_mapped(u64 start, u64 end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - const struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - - /* - * This function checks if the entire range <start,end> is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; + e820_print_map("limit_regions endfunc"); } /* Overridden in paravirt.c if CONFIG_PARAVIRT */ @@ -700,7 +342,7 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); + e820_print_map(memory_setup()); } static int __initdata user_defined_memmap; @@ -783,55 +425,12 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - - return real_updated_size; -} void __init finish_e820_parsing(void) { if (user_defined_memmap) { printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); + e820_print_map("user"); } } -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - print_memory_map("modified"); -} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index c45b4de..28a14eb 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -29,8 +29,6 @@ #include <asm/kdebug.h> #include <asm/trampoline.h> -struct e820map e820; - /* * PFN of last memory page. */ @@ -176,62 +174,6 @@ again: } return changed; } -/* - * This function checks if any part of the range <start,end> is mapped - * with type. - */ -int -e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(e820_any_mapped); - -/* - * This function checks if the entire range <start,end> is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init e820_all_mapped(unsigned long start, unsigned long end, - unsigned type) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - - /* if the region is at the beginning of <start,end> we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* - * if start is now at or beyond end, we're done, full - * coverage - */ - if (start >= end) - return 1; - } - return 0; -} /* * Find a free area with specified alignment in a specific range. @@ -435,24 +377,6 @@ e820_register_active_regions(int nid, unsigned long start_pfn, } /* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - -/* * Find the hole size (in bytes) in the memory range. * @start: starting address of the memory range to scan * @end: ending address of the memory range to scan @@ -473,266 +397,6 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -static void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: - printk(KERN_CONT "(usable)\n"); - break; - case E820_RESERVED: - printk(KERN_CONT "(reserved)\n"); - break; - case E820_ACPI: - printk(KERN_CONT "(ACPI data)\n"); - break; - case E820_NVS: - printk(KERN_CONT "(ACPI NVS)\n"); - break; - default: - printk(KERN_CONT "type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following - (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i = 0; i < old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i = 0; i < 2 * old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i = 0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + - biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries = 0; /* number of entries in the overlap table */ - new_bios_entry = 0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - - /* loop through change-points, determining affect on the new bios map */ - for (chgidx = 0; chgidx < chg_nr; chgidx++) { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == - change_point[chgidx]->pbios->addr) { - /* - * add map entry to overlap list (> 1 entry - * implies an overlap) - */ - overlap_list[overlap_entries++] = - change_point[chgidx]->pbios; - } else { - /* - * remove entry from list (order independent, - * so swap with last) - */ - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == - change_point[chgidx]->pbios) - overlap_list[i] = - overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* - * if there are overlapping entries, decide which - * "type" to use (larger value takes precedence -- - * 1=usable, 2,3,4,4+=unusable) - */ - current_type = 0; - for (i = 0; i < overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* - * continue building up new bios map based on this - * information - */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* - * move forward only if the new size - * was non-zero - */ - if (new_bios[new_bios_entry].size != 0) - /* - * no more space left for new - * bios entries ? - */ - if (++new_bios_entry >= E820MAX) - break; - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = - change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr = change_point[chgidx]->addr; - } - last_type = current_type; - } - } - /* retain count for new bios entries */ - new_nr = new_bios_entry; - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - add_memory_region(start, size, type); - } while (biosmap++, --nr_map); - return 0; -} - static void early_panic(char *msg) { early_printk(msg); @@ -829,114 +493,6 @@ void __init finish_e820_parsing(void) } } -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) -{ - int i; - u64 real_updated_size = 0; - - BUG_ON(old_type == new_type); - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 final_start, final_end; - if (ei->type != old_type) - continue; - /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { - ei->type = new_type; - real_updated_size += ei->size; - continue; - } - /* partially covered */ - final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); - if (final_start >= final_end) - continue; - add_memory_region(final_start, final_end - final_start, - new_type); - real_updated_size += final_end - final_start; - } - return real_updated_size; -} - -void __init update_e820(void) -{ - u8 nr_map; - - nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) - return; - e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); - e820_print_map("modified"); -} - -unsigned long pci_mem_start = 0xaeedbabe; -EXPORT_SYMBOL(pci_mem_start); - -/* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. - */ -__init void e820_setup_gap(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long last; - int i; - int found = 0; - - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - found = 1; - } - } - if (start < last) - last = start; - } - - if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " - "address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource " - "registers may break!\n"); - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); -} - int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) { int i; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b54c79c..5faeab6 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -875,7 +875,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif - e820_register_memory(); + e820_setup_gap(); e820_mark_nosave_regions(); #ifdef CONFIG_VT diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 7004251..b5b519f 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -20,6 +20,20 @@ struct e820map { __u32 nr_map; struct e820entry map[E820MAX]; }; + +extern struct e820map e820; + +extern int e820_any_mapped(u64 start, u64 end, unsigned type); +extern int e820_all_mapped(u64 start, u64 end, unsigned type); +extern void add_memory_region(u64 start, u64 size, int type); +extern void e820_print_map(char *who); +extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); +extern int copy_e820_map(struct e820entry *biosmap, int nr_map); +extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type); +extern void update_e820(void); +extern void e820_setup_gap(void); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index af0711b..9576b43 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,19 +21,8 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern void propagate_e820_map(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); -extern void add_memory_region(unsigned long long start, - unsigned long long size, int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); -extern void e820_register_memory(void); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, @@ -47,6 +36,5 @@ static inline void e820_mark_nosave_regions(void) } #endif - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 6ae3e28..9fac77e 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -19,34 +19,22 @@ extern unsigned long find_e820_area(unsigned long start, unsigned long end, extern unsigned long find_e820_area_size(unsigned long start, unsigned long *sizep, unsigned long align); -extern void add_memory_region(unsigned long start, unsigned long size, - int type); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, - unsigned new_type); extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); extern void e820_mark_nosave_regions(void); -extern int e820_any_mapped(unsigned long start, unsigned long end, - unsigned type); -extern int e820_all_mapped(unsigned long start, unsigned long end, - unsigned type); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); extern int is_memory_all_valid(unsigned long start, unsigned long end); extern unsigned long e820_hole_size(unsigned long start, unsigned long end); -extern void e820_setup_gap(void); extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern void finish_e820_parsing(void); -extern struct e820map e820; -extern void update_e820(void); - extern void reserve_early(unsigned long start, unsigned long end, char *name); extern void free_early(unsigned long start, unsigned long end); extern void early_res_to_bootmem(unsigned long start, unsigned long end); -- cgit v1.1 From 8004dd965b13b01a96def054d420f6df7ff22d53 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 12 May 2008 17:40:39 -0700 Subject: x86: amd opteron TOM2 mask val fix there is a typo in the mask value, need to remove that extra 0, to avoid 4bit clearing. Signed-off-by: Yinghal Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/pci/k8-bus_64.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a83f5cd7..509bd3d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -233,7 +233,7 @@ void __init get_mtrr_state(void) mtrr_tom2 = high; mtrr_tom2 <<= 32; mtrr_tom2 |= low; - mtrr_tom2 &= 0xffffff8000000ULL; + mtrr_tom2 &= 0xffffff800000ULL; } if (mtrr_show) { int high_width; diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c index 5c2799c..bfefdf0 100644 --- a/arch/x86/pci/k8-bus_64.c +++ b/arch/x86/pci/k8-bus_64.c @@ -384,7 +384,7 @@ static int __init early_fill_mp_bus_info(void) /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); + end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); if (end < (1ULL<<32)) update_range(range, 0, end - 1); @@ -478,7 +478,7 @@ static int __init early_fill_mp_bus_info(void) /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); + end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); update_range(range, 1ULL<<32, end - 1); } -- cgit v1.1 From 3f03c54a34fa3da680b3341dd3ba965ef3394bd1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 9 May 2008 22:40:52 -0700 Subject: x86: mtrr cleanup for converting continuous to discrete layout - fix #2 disable the noisy print out. also use the one the less spare mtrr reg. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mtrr/main.c | 79 +++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index e6c162c..0642201 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -730,6 +730,7 @@ struct var_mtrr_range_state { }; struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static int __initdata debug_print; static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -748,10 +749,12 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, nr_range = add_range_with_merge(range, nr_range, base, base + size - 1); } - printk(KERN_DEBUG "After WB checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + if (debug_print) { + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } /* take out UC ranges */ for (i = 0; i < num_var_ranges; i++) { @@ -775,17 +778,21 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; nr_range++; } - printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", - range[i].start, range[i].end + 1); + if (debug_print) { + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + range[i].start, range[i].end + 1); + } /* sort the ranges */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - printk(KERN_DEBUG "After sorting\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + if (debug_print) { + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } /* clear those is not used */ for (i = nr_range; i < RANGE_NUM; i++) @@ -912,12 +919,13 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - printk(KERN_DEBUG "Setting variable MTRR %d, base: %ldMB, " - "range: %ldMB, type %s\n", - reg, range_startk >> 10, sizek >> 10, - (type == MTRR_TYPE_UNCACHABLE)?"UC": - ((type == MTRR_TYPE_WRBACK)?"WB":"Other") - ); + if (debug_print) + printk(KERN_DEBUG "Setting variable MTRR %d, " + "base: %ldMB, range: %ldMB, type %s\n", + reg, range_startk >> 10, sizek >> 10, + (type == MTRR_TYPE_UNCACHABLE)?"UC": + ((type == MTRR_TYPE_WRBACK)?"WB":"Other") + ); save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; @@ -963,7 +971,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); if (range0_sizek == state->range_sizek) { - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", range0_basek<<10, + if (debug_print) + printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", + range0_basek<<10, (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); @@ -980,7 +990,9 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } if (range0_sizek) { - printk(KERN_DEBUG "range0: %016lx - %016lx\n", range0_basek<<10, + if (debug_print) + printk(KERN_DEBUG "range0: %016lx - %016lx\n", + range0_basek<<10, (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); @@ -1016,13 +1028,15 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_sizek = 0; } - printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, (range_basek + range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); if (hole_sizek) { - printk(KERN_DEBUG "hole: %016lx - %016lx\n", hole_basek<<10, - (hole_basek + hole_sizek)<<10); + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); @@ -1120,7 +1134,6 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, /* Write the last range */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); - printk(KERN_DEBUG "DONE variable MTRRs\n"); num_reg = var_state.reg; /* Clear out the extra MTRR's */ @@ -1219,6 +1232,7 @@ static int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; + debug_print = 1; /* convert ranges to var ranges state */ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, mtrr_gran_size); @@ -1242,8 +1256,8 @@ static int __init mtrr_cleanup(unsigned address_bits) result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; - printk(KERN_INFO " %sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?" BAD ":"", result[i].gran_sizek >> 10, + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", result[i].num_reg, result[i].bad?"-":"", @@ -1254,6 +1268,7 @@ static int __init mtrr_cleanup(unsigned address_bits) } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); + debug_print = 0; memset(result, 0, sizeof(result[0])); } @@ -1265,9 +1280,10 @@ static int __init mtrr_cleanup(unsigned address_bits) chunk_size <<= 1) { int num_reg; - printk(KERN_INFO + if (debug_print) + printk(KERN_INFO "\ngran_size: %lldM chunk_size_size: %lldM\n", - gran_size >> 20, chunk_size >> 20); + gran_size >> 20, chunk_size >> 20); if (i >= NUM_RESULT) continue; @@ -1310,10 +1326,10 @@ static int __init mtrr_cleanup(unsigned address_bits) /* print out all */ for (i = 0; i < NUM_RESULT; i++) { - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", + printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", result[i].num_reg, result[i].bad?"-":"", result[i].lose_cover_sizek >> 10); } @@ -1322,7 +1338,7 @@ static int __init mtrr_cleanup(unsigned address_bits) if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; num_reg_good = -1; - for (i = 1; i < num_var_ranges + 1 - nr_mtrr_spare_reg; i++) { + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) { num_reg_good = i; break; @@ -1344,10 +1360,10 @@ static int __init mtrr_cleanup(unsigned address_bits) if (index_good != -1) { printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", + printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", result[i].gran_sizek >> 10, result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %ldM \n", + printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", result[i].num_reg, result[i].lose_cover_sizek >> 10); /* convert ranges to var ranges state */ @@ -1355,6 +1371,7 @@ static int __init mtrr_cleanup(unsigned address_bits) chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; + debug_print = 1; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); return 1; -- cgit v1.1 From e3f8ba81fdce852abe36a33eed7b243c4a0acfac Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:04 -0700 Subject: x86 boot: include missing smp.h header The patch: x86: convert cpu_to_apicid to be a per cpu variable introduced a dependency of ipi.h on smp.h in x86 builds with an allnoconfig. Including smp.h in ipi.h fixes the build error: In file included from arch/x86/kernel/traps_64.c:48: include/asm/ipi.h: In function 'send_IPI_mask_sequence': include/asm/ipi.h:114: error: 'per_cpu__x86_cpu_to_apicid' undeclared (first use in this function) Signed-off-by: Paul Jackson <pj@sgi.com> Cc: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/ipi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86/ipi.h b/include/asm-x86/ipi.h index ecc80f3..196d63c 100644 --- a/include/asm-x86/ipi.h +++ b/include/asm-x86/ipi.h @@ -20,6 +20,7 @@ #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/smp.h> /* * the following functions deal with sending IPIs between CPUs. -- cgit v1.1 From e9197bf0114661195bee35e7795cfc42164d9b2c Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:10 -0700 Subject: x86 boot: remove some unused extern function declarations Remove three extern declarations for routines that don't exist. Fix a typo in a comment. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/numa_64.c | 2 +- include/linux/efi.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d5..afb07ff 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, else bootmap_start = round_up(start, PAGE_SIZE); /* - * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like + * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE */ bootmap = early_node_mem(nodeid, bootmap_start, end, diff --git a/include/linux/efi.h b/include/linux/efi.h index a5f359a..807373d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -287,7 +287,6 @@ efi_guid_unparse(efi_guid_t *guid, char *out) extern void efi_init (void); extern void *efi_get_pal_addr (void); extern void efi_map_pal_code (void); -extern void efi_map_memmap(void); extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg); extern void efi_gettimeofday (struct timespec *ts); extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */ @@ -295,14 +294,11 @@ extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); -extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, - u64 attr); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern unsigned long efi_get_time(void); extern int efi_set_rtc_mmss(unsigned long nowtime); -extern int is_available_memory(efi_memory_desc_t * md); extern struct efi_memory_map memmap; /** -- cgit v1.1 From cb5dd7c104c64d7ba8ff4dd3eec3d9b92c378937 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:16 -0700 Subject: x86 boot: add header comment to dmi.h stating what it is The "dmi.h" file did not state anywhere in the file what "DMI" was. For those who know, it's obvious. For the rest of us, I added a brief opening comment. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- drivers/firmware/dmi_scan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index c5e3ed7..455575b 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -8,6 +8,11 @@ #include <linux/slab.h> #include <asm/dmi.h> +/* + * DMI stands for "Desktop Management Interface". It is part + * of and an antecedent to, SMBIOS, which stands for System + * Management BIOS. See further: http://www.dmtf.org/standards + */ static char dmi_empty_string[] = " "; static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) -- cgit v1.1 From c801ed3860fe2f84281d4cae4c3e6ca87e81e890 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:23 -0700 Subject: x86 boot: simplify pageblock_bits enum declaration The use of #defines with '##' pre-processor concatenation is a useful way to form several symbol names with a common pattern. But when there is just a single name obtained from that #define, it's just obfuscation. Better to just write the plain symbol name, as is. The following patch is a result of my wasting ten minutes looking through the kernel to figure out what 'PB_migrate_end' meant, and forgetting what I came to do, by the time I figured out that the #define PB_range macro defined it. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/linux/pageblock-flags.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e875905..e8c0612 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -25,13 +25,11 @@ #include <linux/types.h> -/* Macro to aid the definition of ranges of bits */ -#define PB_range(name, required_bits) \ - name, name ## _end = (name + required_bits) - 1 - /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_range(PB_migrate, 3), /* 3 bits required for migrate types */ + PB_migrate, + PB_migrate_end = PB_migrate + 3 - 1, + /* 3 bits required for migrate types */ NR_PAGEBLOCK_BITS }; -- cgit v1.1 From b25e31cec7788ccba5749d10f8f4343fffff4750 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:28 -0700 Subject: x86 boot: minor code format fixes in e820 and efi routines Standardize a few pointer declarations to not have the extra space after the '*' character. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_64.c | 2 +- arch/x86/kernel/efi_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 28a14eb..c10b4ae 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -404,7 +404,7 @@ static void early_panic(char *msg) } /* We're not void only for x86 32-bit compat */ -char * __init machine_specific_memory_setup(void) +char *__init machine_specific_memory_setup(void) { char *who = "BIOS-e820"; /* diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d0060fd..21a7b75 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -103,7 +103,7 @@ void __init efi_reserve_bootmem(void) memmap.nr_map * memmap.desc_size); } -void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped __initdata; unsigned i, pages; -- cgit v1.1 From c3965bd15118742d72b4bc1a290d37b3f081eb98 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:34 -0700 Subject: x86 boot: proper use of ARRAY_SIZE instead of repeated E820MAX constant This patch is motivated by a subsequent patch which will allow for more memory map entries on EFI supported systems than can be passed via the x86 legacy BIOS E820 interface. The legacy interface is limited to E820MAX == 128 memory entries, and that "E820MAX" manifest constant was used as the size for several arrays and loops over those arrays. The primary change in this patch is to change code loop sizes over those arrays from using the constant E820MAX, to using the ARRAY_SIZE() macro evaluated for the array being looped. That way, a subsequent patch can change the size of some of these arrays, without breaking this code. This patch also adds a parameter to the sanitize_e820_map() routine, which had an implicit size for the array passed it of E820MAX entries. This new parameter explicitly passes the size of said array. Once again, this will allow a subsequent patch to change that array size for some calls to sanitize_e820_map() without breaking the code. As part of enhancing the sanitize_e820_map() interface this way, I further combined the unnecessarily distinct x86_32 and x86_64 declarations for this routine into a single, commonly used, declaration. This patch in itself should make no difference to the resulting kernel binary. [ mingo@elte.hu: merged to -tip ] Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/boot/memory.c | 3 ++- arch/x86/kernel/e820.c | 9 +++++---- arch/x86/kernel/e820_64.c | 6 ++++-- arch/x86/mach-default/setup.c | 4 +++- arch/x86/mach-voyager/setup.c | 4 +++- include/asm-x86/e820.h | 3 ++- include/asm-x86/setup.h | 1 - 7 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index acad32e..53165c9 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -13,6 +13,7 @@ */ #include "boot.h" +#include <linux/kernel.h> #define SMAP 0x534d4150 /* ASCII "SMAP" */ @@ -53,7 +54,7 @@ static int detect_memory_e820(void) count++; desc++; - } while (next && count < E820MAX); + } while (next && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2cb686f..2396b9d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -95,7 +95,7 @@ void __init add_memory_region(u64 start, u64 size, int type) { int x = e820.nr_map; - if (x == E820MAX) { + if (x == ARRAY_SIZE(e820.map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } @@ -142,7 +142,8 @@ void __init e820_print_map(char *who) * replaces the original e820 map with a new one, removing overlaps. * */ -int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -314,7 +315,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) * no more space left for new * bios entries ? */ - if (++new_bios_entry >= E820MAX) + if (++new_bios_entry >= max_nr_map) break; } if (current_type != 0) { @@ -403,7 +404,7 @@ void __init update_e820(void) u8 nr_map; nr_map = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr_map)) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) return; e820.nr_map = nr_map; printk(KERN_INFO "modified physical RAM map:\n"); diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index c10b4ae..58dc6ee 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -413,7 +413,9 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); @@ -484,7 +486,7 @@ void __init finish_e820_parsing(void) if (userdef) { char nr = e820.nr_map; - if (sanitize_e820_map(e820.map, &nr) < 0) + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); e820.nr_map = nr; diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 0c28a07..38a856c 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -163,7 +163,9 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 5ae5466..662b5c0 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -111,7 +111,9 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index b5b519f..65c891d 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -27,7 +27,8 @@ extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void add_memory_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); -extern int sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); +extern int +sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index fa6763a..ffa0f54 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -56,7 +56,6 @@ char * __init machine_specific_memory_setup(void); char *memory_setup(void); int __init copy_e820_map(struct e820entry *biosmap, int nr_map); -int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map); void __init add_memory_region(unsigned long long start, unsigned long long size, int type); -- cgit v1.1 From 028b785888c523baccdf27af0cdbf1deb92edec0 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:40 -0700 Subject: x86 boot: extend some internal memory map arrays to handle larger EFI input Extend internal boot time memory tables to allow for up to three entries per node, which may be larger than the 128 E820MAX entries handled by the legacy BIOS E820 interface. The EFI interface, if present, is capable of passing memory map entries for these larger node counts. This patch requires an earlier patch that rewrote code depending on these array sizes from using E820MAX explicitly to size loops, to instead using ARRAY_SIZE() of the applicable array. Another patch following this one will provide the code to pick up additional memory entries passed via the EFI interface from the BIOS and insert them in the following, now enlarged, arrays. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 8 ++++---- include/asm-x86/e820.h | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2396b9d..3f7777b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,10 +149,10 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, struct e820entry *pbios; /* pointer to original bios entry */ unsigned long long addr; /* address for this change point */ }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; +static struct change_member change_point_list[2*E820_X_MAX] __initdata; +static struct change_member *change_point[2*E820_X_MAX] __initdata; +static struct e820entry *overlap_list[E820_X_MAX] __initdata; +static struct e820entry new_bios[E820_X_MAX] __initdata; struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 65c891d..ab18457 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -2,6 +2,41 @@ #define __ASM_E820_H #define E820MAP 0x2d0 /* our map */ #define E820MAX 128 /* number of entries in E820MAP */ + +/* + * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the + * constrained space in the zeropage. If we have more nodes than + * that, and if we've booted off EFI firmware, then the EFI tables + * passed us from the EFI firmware can list more nodes. Size our + * internal memory map tables to have room for these additional + * nodes, based on up to three entries per node for which the + * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT), + * plus E820MAX, allowing space for the possible duplicate E820 + * entries that might need room in the same arrays, prior to the + * call to sanitize_e820_map() to remove duplicates. The allowance + * of three memory map entries per node is "enough" entries for + * the initial hardware platform motivating this mechanism to make + * use of additional EFI map entries. Future platforms may want + * to allow more than three entries per node or otherwise refine + * this size. + */ + +/* + * Odd: 'make headers_check' complains about numa.h if I try + * to collapse the next two #ifdef lines to a single line: + * #if defined(__KERNEL__) && defined(CONFIG_EFI) + */ +#ifdef __KERNEL__ +#ifdef CONFIG_EFI +#include <linux/numa.h> +#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) +#else /* ! CONFIG_EFI */ +#define E820_X_MAX E820MAX +#endif +#else /* ! __KERNEL__ */ +#define E820_X_MAX E820MAX +#endif + #define E820NR 0x1e8 /* # entries in E820MAP */ #define E820_RAM 1 @@ -18,7 +53,7 @@ struct e820entry { struct e820map { __u32 nr_map; - struct e820entry map[E820MAX]; + struct e820entry map[E820_X_MAX]; }; extern struct e820map e820; -- cgit v1.1 From 6e9bcc796b120d17b08dde7ab958b82ddb899889 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:46 -0700 Subject: x86 boot: change sanitize_e820_map parameter from byte to int to allow bigger memory maps The map size counter passed into, and back out of, sanitize_e820_map(), was an eight bit type (char or u8), as derived from its origins in legacy BIOS E820 structures. This patch changes that type to an 'int', to allow this sanitize routine to also be used on larger maps (larger than the 256 count that fits in a char). The legacy BIOS E820 interface of course does not change; that remains at 8 bits for this count, holding up to E820MAX == 128 entries. But the kernel internals can handle more when those additional memory map entries are passed from the BIOS via EFI interfaces. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 5 +++-- arch/x86/kernel/e820_64.c | 7 +++++-- arch/x86/mach-default/setup.c | 5 ++++- arch/x86/mach-voyager/setup.c | 5 ++++- include/asm-x86/e820.h | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3f7777b..91abf5b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -143,7 +143,7 @@ void __init e820_print_map(char *who) * */ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, - char *pnr_map) + int *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -204,6 +204,7 @@ static struct e820entry new_bios[E820_X_MAX] __initdata; return -1; old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); /* bail out if we find any unreasonable addresses in bios map */ for (i = 0; i < old_nr; i++) @@ -401,7 +402,7 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - u8 nr_map; + int nr_map; nr_map = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 58dc6ee..354fbb2 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -407,15 +407,18 @@ static void early_panic(char *msg) char *__init machine_specific_memory_setup(void) { char *who = "BIOS-e820"; + int new_nr; /* * Try to copy the BIOS-supplied E820-map. * * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); @@ -484,7 +487,7 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - char nr = e820.nr_map; + int nr = e820.nr_map; if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) early_panic("Invalid user supplied memory map"); diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 38a856c..56b4c39 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -153,6 +153,7 @@ late_initcall(print_ipi_mode); char * __init machine_specific_memory_setup(void) { char *who; + int new_nr; who = "BIOS-e820"; @@ -163,9 +164,11 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 662b5c0..f4aca9f 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -62,6 +62,7 @@ void __init time_init_hook(void) char *__init machine_specific_memory_setup(void) { char *who; + int new_nr; who = "NOT VOYAGER"; @@ -111,9 +112,11 @@ char *__init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ + new_nr = boot_params.e820_entries; sanitize_e820_map(boot_params.e820_map, ARRAY_SIZE(boot_params.e820_map), - &boot_params.e820_entries); + &new_nr); + boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { unsigned long mem_size; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index ab18457..1eb13b8 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -63,7 +63,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void add_memory_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); extern int -sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, char *pnr_map); +sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, unsigned new_type); -- cgit v1.1 From 5b7eb2e9ef4e467a1248537b47a63bab265be3cc Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:52 -0700 Subject: x86 boot: longer comment explaining sanitize_e820_map routine Elaborate on the comment for sanitize_e820_map(), epxlaining more what it does, what it inputs, and what it returns. Rearrange the placement of this comment to fit kernel conventions, before the routine's code rather than buried inside it. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 94 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 91abf5b..41c480a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -139,9 +139,64 @@ void __init e820_print_map(char *who) * Sanitize the BIOS e820 map. * * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be in + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... + * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ */ + int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map) { @@ -162,43 +217,6 @@ static struct e820entry new_bios[E820_X_MAX] __initdata; int old_nr, new_nr, chg_nr; int i; - /* - Visually we're performing the following - (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - /* if there's only one memory region, don't bother */ if (*pnr_map < 2) return -1; -- cgit v1.1 From 69c9189320c46b14e5ae3ad4b3a0d35cc63cba20 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 14 May 2008 08:15:58 -0700 Subject: x86 boot: add code to add BIOS provided EFI memory entries to kernel Add to the kernels boot memory map 'memmap' entries found in the EFI memory descriptors passed in from the BIOS. On EFI systems, up to E820MAX == 128 memory map entries can be passed via the legacy E820 interface (limited by the size of the 'zeropage'). These entries can be duplicated in the EFI descriptors also passed from the BIOS, and possibly more entries passed by the EFI interface, which does not have the E820MAX limit on number of memory map entries. This code doesn't worry about the likely duplicate, overlapping or (unlikely) conflicting entries between the EFI map and the E820 map. It just dumps all the EFI entries into the memmap[] array (which already has the E820 entries) and lets the existing routine sanitize_e820_map() sort the mess out. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/efi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 77d424c..4a1a26d 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -213,6 +213,31 @@ unsigned long efi_get_time(void) eft.minute, eft.second); } +/* + * Tell the kernel about the EFI memory map. This might include + * more than the max 128 entries that can fit in the e820 legacy + * (zeropage) memory map. + */ + +static void __init add_efi_memmap(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + int e820_type; + + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + add_memory_region(start, size, e820_type); + } + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + #if EFI_DEBUG static void __init print_efi_memmap(void) { @@ -370,6 +395,7 @@ void __init efi_init(void) if (memmap.desc_size != sizeof(efi_memory_desc_t)) printk(KERN_WARNING "Kernel-defined memdesc" "doesn't match the one from EFI!\n"); + add_efi_memmap(); /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; -- cgit v1.1 From a4c81cf684350797939416c99effb9d3ae46bca6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 18 May 2008 01:18:57 -0700 Subject: x86: extend e820 ealy_res support 32bit move early_res related from e820_64.c to e820.c make edba detection to be done in head32.c remove smp_alloc_memory, because we have fixed trampoline address now. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 -------------------------------- arch/x86/kernel/head32.c | 76 ++++++++++++ arch/x86/kernel/setup_32.c | 109 +++--------------- arch/x86/kernel/smpboot.c | 17 -- arch/x86/kernel/trampoline.c | 2 arch/x86/mach-voyager/voyager_smp.c | 9 - include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 - include/asm-x86/smp.h | 1 10 files changed, 320 insertions(+), 319 deletions(-) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 214 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 196 --------------------------------- arch/x86/kernel/head32.c | 76 +++++++++++++ arch/x86/kernel/setup_32.c | 109 ++++-------------- arch/x86/kernel/smpboot.c | 17 --- arch/x86/kernel/trampoline.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 9 -- include/asm-x86/e820.h | 6 + include/asm-x86/e820_64.h | 9 -- include/asm-x86/smp.h | 1 - 10 files changed, 320 insertions(+), 319 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 41c480a..35da8cd 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -22,7 +22,9 @@ #include <asm/pgtable.h> #include <asm/page.h> #include <asm/e820.h> +#include <asm/proto.h> #include <asm/setup.h> +#include <asm/trampoline.h> struct e820map e820; @@ -493,3 +495,215 @@ __init void e820_setup_gap(void) pci_mem_start, gapstart, gapsize); } + +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 + +struct early_res { + u64 start, end; + char name[16]; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, +#endif +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, + /* + * Has to be in very low memory so we can execute + * real-mode AP code. + */ + { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, +#endif + {} +}; + +void __init reserve_early(u64 start, u64 end, char *name) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name?name:"", r->start, + r->end - 1, r->name); + } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i, j; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (start == r->start && end == r->end) + break; + } + if (i >= MAX_EARLY_RES || !early_res[i].end) + panic("free_early on not reserved area: %llx-%llx!", + start, end); + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i; + u64 final_start, final_end; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) + continue; + printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, + final_start, final_end - 1, r->name); +#ifdef CONFIG_X86_64 + reserve_bootmem_generic(final_start, final_end - final_start); +#else + reserve_bootmem(final_start, final_end - final_start, + BOOTMEM_DEFAULT); +#endif + } +} + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + continue; + if (last > end) + continue; + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + return -1UL; + +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 354fbb2..07941b5 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -47,202 +47,6 @@ unsigned long max_pfn_mapped; static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; /* - * Early reserved memory areas. - */ -#define MAX_EARLY_RES 20 - -struct early_res { - unsigned long start, end; - char name[16]; -}; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#ifdef CONFIG_X86_TRAMPOLINE - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif - {} -}; - -void __init reserve_early(unsigned long start, unsigned long end, char *name) -{ - int i; - struct early_res *r; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", - start, end - 1, name?name:"", r->start, r->end - 1, r->name); - } - if (i >= MAX_EARLY_RES) - panic("Too many early reservations"); - r = &early_res[i]; - r->start = start; - r->end = end; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); -} - -void __init free_early(unsigned long start, unsigned long end) -{ - struct early_res *r; - int i, j; - - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (start == r->start && end == r->end) - break; - } - if (i >= MAX_EARLY_RES || !early_res[i].end) - panic("free_early on not reserved area: %lx-%lx!", start, end); - - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; -} - -void __init early_res_to_bootmem(unsigned long start, unsigned long end) -{ - int i; - unsigned long final_start, final_end; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) - continue; - printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, - final_start, final_end - 1, r->name); - reserve_bootmem_generic(final_start, final_end - final_start); - } -} - -/* Check for already reserved areas */ -static inline int __init -bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) -{ - int i; - unsigned long addr = *addrp, last; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last >= r->start && addr < r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init -bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) -{ - int i; - unsigned long addr = *addrp, last; - unsigned long size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, - unsigned long size, unsigned long align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr, last; - unsigned long ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) - continue; - return addr; - } - return -1UL; -} - -/* - * Find next free range after *start - */ -unsigned long __init find_e820_area_size(unsigned long start, - unsigned long *sizep, - unsigned long align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr, last; - unsigned long ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; - } - return -1UL; - -} -/* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3db0590..c216d3c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -8,7 +8,83 @@ #include <linux/init.h> #include <linux/start_kernel.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/e820.h> +#include <asm/bios_ebda.h> + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +static void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early(lowmem, 0x100000, "BIOS reserved"); +} + void __init i386_start_kernel(void) { + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + } +#endif + reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE"); + + reserve_ebda_region(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5faeab6..fed482c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -359,56 +359,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT); -} - #ifndef CONFIG_NEED_MULTIPLE_NODES static void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) @@ -522,25 +472,32 @@ static void __init reserve_initrd(void) unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; unsigned long ramdisk_here; - initrd_start = 0; - if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ + initrd_start = 0; + if (ramdisk_end < ramdisk_image) { + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd wraps around end of memory, " "disabling initrd\n"); return; } if (ramdisk_size >= end_of_lowmem/2) { + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; } + if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ - reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); + /* + * don't need to reserve again, already reserved early + * in i386_start_kernel, and early_res_to_bootmem + * convert that to reserved in bootmem + */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; return; @@ -582,6 +539,8 @@ static void __init relocate_initrd(void) p = (char *)__va(ramdisk_image); memcpy(q, p, clen); q += clen; + /* need to free these low pages...*/ + free_bootmem(ramdisk_image, clen); ramdisk_image += clen; ramdisk_size -= clen; } @@ -600,47 +559,28 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } + /* high pages is not converted by early_res_to_bootmem */ } #endif /* CONFIG_BLK_DEV_INITRD */ void __init setup_bootmem_allocator(void) { - unsigned long bootmap_size; + unsigned long bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): */ - bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); - + bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, + max_low_pfn<<PAGE_SHIFT, bootmap_size, + PAGE_SIZE); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n", bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); register_bootmem_low_pages(max_low_pfn); + early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); - /* - * Reserve the bootmem bitmap itself as well. We do this in two - * steps (first step was init_bootmem()) because this catches - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. - */ - reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), - BOOTMEM_DEFAULT); - - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. - */ - reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); - - /* reserve EBDA region */ - reserve_ebda_region(); - -#ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); -#endif #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -803,9 +743,6 @@ void __init setup_arch(char **cmdline_p) * not to exceed the 8Mb limit. */ -#ifdef CONFIG_SMP - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ -#endif paging_init(); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3898849..843722e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -555,23 +555,6 @@ cpumask_t cpu_coregroup_map(int cpu) return c->llc_shared_map; } -#ifdef CONFIG_X86_32 -/* - * We are called very early to get the low memory for the - * SMP bootup trampoline page. - */ -void __init smp_alloc_memory(void) -{ - trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - if (__pa(trampoline_base) >= 0x9F000) - BUG(); -} -#endif - static void impress_friends(void) { int cpu; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index abbf199..1106fac 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,7 +2,7 @@ #include <asm/trampoline.h> -/* ready for x86_64, no harm for x86, since it will overwrite after alloc */ +/* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); /* diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 8acbf0c..7bbebbf 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1137,15 +1137,6 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, 0, 1, 1); } -/* used to set up the trampoline for other CPUs when the memory manager - * is sorted out */ -void __init smp_alloc_memory(void) -{ - trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); - if (__pa(trampoline_base) >= 0x93000) - BUG(); -} - /* send a reschedule CPI to one CPU by physical CPU number*/ static void voyager_smp_send_reschedule(int cpu) { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 1eb13b8..5a58e2b 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -70,6 +70,12 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, extern void update_e820(void); extern void e820_setup_gap(void); +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); +extern void reserve_early(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 9fac77e..37f176a 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -14,11 +14,6 @@ #include <linux/ioport.h> #ifndef __ASSEMBLY__ -extern unsigned long find_e820_area(unsigned long start, unsigned long end, - unsigned long size, unsigned long align); -extern unsigned long find_e820_area_size(unsigned long start, - unsigned long *sizep, - unsigned long align); extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); @@ -35,10 +30,6 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn, extern void finish_e820_parsing(void); -extern void reserve_early(unsigned long start, unsigned long end, char *name); -extern void free_early(unsigned long start, unsigned long end); -extern void early_res_to_bootmem(unsigned long start, unsigned long end); - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5c..514e52b 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -201,7 +201,6 @@ extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif -extern void smp_alloc_memory(void); extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); #endif /* __ASSEMBLY__ */ -- cgit v1.1 From ba5b14cc0311af6ed20dc25aa98a1d5ea6f2e6c0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 21 May 2008 18:40:18 -0700 Subject: x86: extend e820 ealy_res support 32bit - fix use find_e820_area to find addess for new RAMDISK, instead of using ram blindly also print out low ram and bootmap info Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/setup_32.c | 55 ++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index fed482c..3c451d1 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -466,11 +466,11 @@ static bool do_relocate_initrd = false; static void __init reserve_initrd(void) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - unsigned long ramdisk_here; + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 ramdisk_here; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -478,14 +478,8 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_end < ramdisk_image) { - free_bootmem(ramdisk_image, ramdisk_size); - printk(KERN_ERR "initrd wraps around end of memory, " - "disabling initrd\n"); - return; - } if (ramdisk_size >= end_of_lowmem/2) { - free_bootmem(ramdisk_image, ramdisk_size); + free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -495,8 +489,7 @@ static void __init reserve_initrd(void) /* All in lowmem, easy case */ /* * don't need to reserve again, already reserved early - * in i386_start_kernel, and early_res_to_bootmem - * convert that to reserved in bootmem + * in i386_start_kernel */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; @@ -504,11 +497,14 @@ static void __init reserve_initrd(void) } /* We need to move the initrd down into lowmem */ - ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT, + end_of_lowmem, ramdisk_size, + PAGE_SIZE); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size - 1, + "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -519,10 +515,10 @@ static void __init reserve_initrd(void) static void __init relocate_initrd(void) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - unsigned long ramdisk_here; + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -540,6 +536,8 @@ static void __init relocate_initrd(void) memcpy(q, p, clen); q += clen; /* need to free these low pages...*/ + printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n", + ramdisk_image, ramdisk_image + clen - 1); free_bootmem(ramdisk_image, clen); ramdisk_image += clen; ramdisk_size -= clen; @@ -560,6 +558,11 @@ static void __init relocate_initrd(void) ramdisk_size -= clen; } /* high pages is not converted by early_res_to_bootmem */ + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n", + ramdisk_image, ramdisk_image + ramdisk_size - 1, + ramdisk_here, ramdisk_here + ramdisk_size - 1); } #endif /* CONFIG_BLK_DEV_INITRD */ @@ -576,10 +579,17 @@ void __init setup_bootmem_allocator(void) PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); + reserve_early(bootmap, bootmap + bootmap_size - 1, "BOOTMAP"); +#ifdef CONFIG_BLK_DEV_INITRD + reserve_initrd(); +#endif bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " low ram: %08lx - %08lx\n", + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); + printk(KERN_INFO " bootmap [%08lx - %08lx]\n", + bootmap, bootmap + bootmap_size - 1); register_bootmem_low_pages(max_low_pfn); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); #ifdef CONFIG_ACPI_SLEEP /* @@ -593,9 +603,6 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif -#ifdef CONFIG_BLK_DEV_INITRD - reserve_initrd(); -#endif numa_kva_reserve(); reserve_crashkernel(); -- cgit v1.1 From 7f028bc0fd119a4fc65aedc07728ce85c8813d33 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: move mp_ioapic_routing to mpparse and make it static mpparse is the only user of mp_ioapic_routing. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 2 -- arch/x86/kernel/mpparse.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc..6f20fa9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -331,8 +331,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #ifdef CONFIG_X86_IO_APIC -struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 404683b..bb72133 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -1,5 +1,5 @@ /* - * Intel Multiprocessor Specification 1.1 and 1.4 +2 * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> @@ -805,7 +805,7 @@ int es7000_plat; #define MP_ISA_BUS 0 -extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; +static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; static int mp_find_ioapic(int gsi) { -- cgit v1.1 From a91eea6df383f26fc4fcbefcbdb5aee8facd17fd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: fix shadow variables of global end_pnf in e820_64.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_64.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 07941b5..5a9bc92 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -51,21 +51,21 @@ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn; + unsigned long last_pfn; - end_pfn = find_max_pfn_with_active_regions(); + last_pfn = find_max_pfn_with_active_regions(); - if (end_pfn > max_pfn_mapped) - max_pfn_mapped = end_pfn; + if (last_pfn > max_pfn_mapped) + max_pfn_mapped = last_pfn; if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) max_pfn_mapped = MAXMEM>>PAGE_SHIFT; - if (end_pfn > end_user_pfn) - end_pfn = end_user_pfn; - if (end_pfn > max_pfn_mapped) - end_pfn = max_pfn_mapped; + if (last_pfn > end_user_pfn) + last_pfn = end_user_pfn; + if (last_pfn > max_pfn_mapped) + last_pfn = max_pfn_mapped; printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); - return end_pfn; + return last_pfn; } /* @@ -124,12 +124,12 @@ void __init e820_mark_nosave_regions(void) } /* - * Finds an active region in the address range from start_pfn to end_pfn and + * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. */ static int __init e820_find_active_region(const struct e820entry *ei, unsigned long start_pfn, - unsigned long end_pfn, + unsigned long last_pfn, unsigned long *ei_startpfn, unsigned long *ei_endpfn) { @@ -146,14 +146,14 @@ static int __init e820_find_active_region(const struct e820entry *ei, /* Skip if map is outside the node */ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= end_pfn) + *ei_startpfn >= last_pfn) return 0; /* Check for overlaps */ if (*ei_startpfn < start_pfn) *ei_startpfn = start_pfn; - if (*ei_endpfn > end_pfn) - *ei_endpfn = end_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; /* Obey end_user_pfn to save on memmap */ if (*ei_startpfn >= end_user_pfn) @@ -167,7 +167,7 @@ static int __init e820_find_active_region(const struct e820entry *ei, /* Walk the e820 map and register active regions within a node */ void __init e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long last_pfn) { unsigned long ei_startpfn; unsigned long ei_endpfn; @@ -175,7 +175,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn, for (i = 0; i < e820.nr_map; i++) if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, + start_pfn, last_pfn, &ei_startpfn, &ei_endpfn)) add_active_range(nid, ei_startpfn, ei_endpfn); } @@ -188,13 +188,13 @@ e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { if (e820_find_active_region(&e820.map[i], - start_pfn, end_pfn, + start_pfn, last_pfn, &ei_startpfn, &ei_endpfn)) ram += ei_endpfn - ei_startpfn; } -- cgit v1.1 From 4a139a7fdec8964cecf7a3564ee7ae327141d495 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: include pci.h in e820_64.c global pci_mem_start needs a declaration. include pci.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 5a9bc92..3b2e0b1 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/suspend.h> #include <linux/pfn.h> +#include <linux/pci.h> #include <asm/pgtable.h> #include <asm/page.h> -- cgit v1.1 From 11a62a056093a7f25f1595fbd8bd5f93559572b6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Mon, 12 May 2008 15:43:38 +0200 Subject: x86: cleanup print out for mptable the new output is: MPTABLE: OEM ID: SUN MPTABLE: Product ID: 4600 M2 MPTABLE: APIC at: 0x instead of it all in one line with <6> and double Product ID... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/mpparse.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index bb72133..5a18b2b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -113,7 +113,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) #ifdef CONFIG_X86_NUMAQ mpc_oem_bus_info(m, str, translation_table[mpc_record]); #else - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); #endif #if MAX_MP_BUSSES < 256 @@ -183,7 +183,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void __init MP_intsrc_info(struct mpc_config_intsrc *m) { mp_irqs[mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," + printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -196,7 +196,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, @@ -309,16 +309,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } memcpy(oem, mpc->mpc_oem, 8); oem[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); memcpy(str, mpc->mpc_productid, 12); str[12] = 0; - printk("Product ID: %s ", str); #ifdef CONFIG_X86_32 mps_oem_check(mpc, oem, str); #endif - printk(KERN_INFO "MPTABLE: Product ID: %s ", str); + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); @@ -689,7 +688,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); + printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { -- cgit v1.1 From 32c5061265caf201d6a2c0d02181e2b68769c22c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Wed, 14 May 2008 19:02:51 +0400 Subject: x86: move es7000_plat out of mpparse.c Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mpparse.c | 11 ++++++----- arch/x86/mach-es7000/es7000plat.c | 2 ++ include/asm-x86/system.h | 1 - 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a18b2b..ff13423 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -793,15 +793,14 @@ void __init find_smp_config(void) ACPI-based MP Configuration -------------------------------------------------------------------------- */ -/* - * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: - */ -int es7000_plat; - #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +extern int es7000_plat; +#endif + #define MP_ISA_BUS 0 static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; @@ -928,11 +927,13 @@ void __init mp_config_acpi_legacy_irqs(void) set_bit(MP_ISA_BUS, mp_bus_not_pci); Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) /* * Older generations of ES7000 have no legacy identity mappings */ if (es7000_plat == 1) return; +#endif /* * Locate the IOAPIC that manages the ISA IRQs (0-15). diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c index f5d6f7d..a41c77a 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/mach-es7000/es7000plat.c @@ -52,6 +52,8 @@ static struct mip_reg *host_reg; static int mip_port; static unsigned long mip_addr, host_addr; +int es7000_plat; + /* * GSI override for ES7000 platforms. */ diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index a2f04cd..9f7f63b 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -303,7 +303,6 @@ static inline void clflush(volatile void *__p) void disable_hlt(void); void enable_hlt(void); -extern int es7000_plat; void cpu_idle_wait(void); extern unsigned long arch_align_stack(unsigned long sp); -- cgit v1.1 From 11113f84c72bd832dc4e76122e613b0e623e2346 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Wed, 14 May 2008 19:02:57 +0400 Subject: x86: complete move ACPI from mpparse.c Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 304 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/mpparse.c | 299 +------------------------------------------ 2 files changed, 305 insertions(+), 298 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6f20fa9..b2ad09c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -846,6 +846,310 @@ static int __init acpi_parse_madt_lapic_entries(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +extern int es7000_plat; +#endif + +static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; + +static int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +static u8 __init uniq_ioapic_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); +#ifdef CONFIG_X86_32 + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); +#else + mp_ioapics[idx].mpc_apicver = 0; +#endif + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; +} + +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity; + mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ + mp_irqs[mp_irq_entries].mpc_dstapic = + mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ + + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); + +} + +void __init mp_config_acpi_legacy_irqs(void) +{ + int i = 0; + int ioapic = -1; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + +#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; +#endif + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; +#ifdef CONFIG_X86_IO_APIC + mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; +#endif + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS + && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == + mp_irqs[mp_irq_entries].mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mpc_srcbusirq = i; /* Identity mapped */ + mp_irqs[mp_irq_entries].mpc_dstirq = i; + + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); + } +} + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic; + int ioapic_pin; +#ifdef CONFIG_X86_32 +#define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 + + static int pci_irq = IRQ_COMPRESSION_START; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; +#else + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; +#endif + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + +#ifdef CONFIG_X86_32 + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); +#endif + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + if (ioapic_pin > MP_MAX_IOAPIC_PIN) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); +#ifdef CONFIG_X86_32 + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); +#else + return gsi; +#endif + } + + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); +#ifdef CONFIG_X86_32 + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_gbl_FADT.sci_interrupt) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } +#endif + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ff13423..d05b70c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -1,5 +1,5 @@ /* -2 * Intel Multiprocessor Specification 1.1 and 1.4 + * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> @@ -788,300 +788,3 @@ void __init find_smp_config(void) { __find_smp_config(1); } - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_IO_APIC - -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) -extern int es7000_plat; -#endif - -#define MP_ISA_BUS 0 - -static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -static u8 __init uniq_ioapic_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].mpc_apicver = 0; -#endif - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). - */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - MP_intsrc_info(&intsrc); -} - -void __init mp_config_acpi_legacy_irqs(void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; -#endif - set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) - /* - * Older generations of ES7000 have no legacy identity mappings - */ - if (es7000_plat == 1) - return; -#endif - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; -#ifdef CONFIG_X86_IO_APIC - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; -#endif - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS - && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - MP_intsrc_info(&intsrc); - } -} - -int mp_register_gsi(u32 gsi, int triggering, int polarity) -{ - int ioapic; - int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 - - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; -#endif - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - -#ifdef CONFIG_X86_32 - if (ioapic_renumber_irq) - gsi = ioapic_renumber_irq(ioapic, gsi); -#endif - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - if (ioapic_pin > MP_MAX_IOAPIC_PIN) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else - return gsi; -#endif - } - - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} - -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ -- cgit v1.1 From 5f8951487ddbacbc949e9ffae574f94791f9b4dd Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Wed, 14 May 2008 19:03:04 +0400 Subject: x86: make mp_ioapic_routing definition local Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 7 ++++++- include/asm-x86/io_apic.h | 7 ------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2ad09c..f75c203 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -852,7 +852,12 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; +static struct { + int apic_id; + int gsi_base; + int gsi_end; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; static int mp_find_ioapic(int gsi) { diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index d593e14..2ef96f0 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -112,13 +112,6 @@ extern int nr_ioapic_registers[MAX_IO_APICS]; #define MP_MAX_IOAPIC_PIN 127 -struct mp_ioapic_routing { - int apic_id; - int gsi_base; - int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -}; - /* I/O APIC entries */ extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; -- cgit v1.1 From ec2cd0a22e2715f776a934e01c4f8ea098324fe1 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Wed, 14 May 2008 19:03:10 +0400 Subject: x86: make struct config_ioapic not MPspec specific Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 28 ++++++++++++------------ arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/io_apic_32.c | 52 ++++++++++++++++++++++---------------------- arch/x86/kernel/io_apic_64.c | 26 +++++++++++----------- arch/x86/kernel/mpparse.c | 8 +++++-- include/asm-x86/io_apic.h | 10 ++++++++- 6 files changed, 69 insertions(+), 57 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f75c203..a26dd91 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -887,8 +887,8 @@ static u8 __init uniq_ioapic_id(u8 id) DECLARE_BITMAP(used, 256); bitmap_zero(used, 256); for (i = 0; i < nr_ioapics; i++) { - struct mpc_config_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->mpc_apicid, used); + struct mp_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mp_apicid, used); } if (!test_bit(id, used)) return id; @@ -920,29 +920,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) idx = nr_ioapics; - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; + mp_ioapics[idx].mp_type = MP_IOAPIC; + mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mp_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); + mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); #ifdef CONFIG_X86_32 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); #else - mp_ioapics[idx].mpc_apicver = 0; + mp_ioapics[idx].mp_apicver = 0; #endif /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, + mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); nr_ioapics++; @@ -975,7 +975,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ mp_irqs[mp_irq_entries].mpc_dstapic = - mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + mp_ioapics[ioapic].mp_apicid; /* APIC ID */ mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ if (++mp_irq_entries == MAX_IRQ_SOURCES) @@ -1016,7 +1016,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; #ifdef CONFIG_X86_IO_APIC - mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid; #endif /* * Use the default configuration for the IRQs 0-15. Unless diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..fa1be1d 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1202,7 +1202,7 @@ void __init init_apic_mappings(void) for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; + ioapic_phys = mp_ioapics[i].mp_apicaddr; if (!ioapic_phys) { printk(KERN_ERR "WARNING: bogus zero IO-APIC " diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54f..5af1b71 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -72,7 +72,7 @@ int sis_apic_bug = -1; int nr_ioapic_registers[MAX_IO_APICS]; /* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ @@ -110,7 +110,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -802,7 +802,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && mp_irqs[i].mpc_dstirq == pin) return i; @@ -844,7 +844,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) return apic; } } @@ -872,7 +872,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) int lbus = mp_irqs[i].mpc_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; @@ -1250,12 +1250,12 @@ static void __init setup_IO_APIC_irqs(void) if (first_notcon) { apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mpc_apicid, + mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mpc_apicid, pin); + mp_ioapics[apic].mp_apicid, pin); continue; } @@ -1357,7 +1357,7 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -1376,7 +1376,7 @@ void __init print_IO_APIC(void) reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1749,14 +1749,14 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic].mpc_apicid; + old_id = mp_ioapics[apic].mp_apicid; - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mpc_apicid); + apic, mp_ioapics[apic].mp_apicid); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; } /* @@ -1765,9 +1765,9 @@ static void __init setup_ioapic_ids_from_mpc(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mpc_apicid)) { + mp_ioapics[apic].mp_apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mpc_apicid); + apic, mp_ioapics[apic].mp_apicid); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -1776,13 +1776,13 @@ static void __init setup_ioapic_ids_from_mpc(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic].mpc_apicid = i; + mp_ioapics[apic].mp_apicid = i; } else { physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic].mpc_apicid); + mp_ioapics[apic].mp_apicid); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -1791,11 +1791,11 @@ static void __init setup_ioapic_ids_from_mpc(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic].mpc_apicid) + if (old_id != mp_ioapics[apic].mp_apicid) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_dstapic == old_id) mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; + = mp_ioapics[apic].mp_apicid; /* * Read the right value from the MPC table and @@ -1803,9 +1803,9 @@ static void __init setup_ioapic_ids_from_mpc(void) */ apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); + mp_ioapics[apic].mp_apicid); - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0, reg_00.raw); spin_unlock_irqrestore(&ioapic_lock, flags); @@ -1816,7 +1816,7 @@ static void __init setup_ioapic_ids_from_mpc(void) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2355,8 +2355,8 @@ static int ioapic_resume(struct sys_device *dev) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2789,7 +2789,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, edge_level, active_high_low); ioapic_register_intr(irq, entry.vector, edge_level); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8df..4555ad8 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -104,7 +104,7 @@ DEFINE_SPINLOCK(vector_lock); int nr_ioapic_registers[MAX_IO_APICS]; /* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ @@ -140,7 +140,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); } static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) @@ -454,7 +454,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && mp_irqs[i].mpc_dstirq == pin) return i; @@ -496,7 +496,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) return apic; } } @@ -524,7 +524,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) int lbus = mp_irqs[i].mpc_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; @@ -846,7 +846,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, irq, trigger, polarity); /* @@ -887,10 +887,10 @@ static void __init setup_IO_APIC_irqs(void) idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); continue; } if (!first_notcon) { @@ -965,7 +965,7 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); /* * We are a bit conservative about what we expect. We have to @@ -983,7 +983,7 @@ void __apicdebuginit print_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1841,8 +1841,8 @@ static int ioapic_resume(struct sys_device *dev) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2336,7 +2336,7 @@ void __init ioapic_init_mappings(void) ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; + ioapic_phys = mp_ioapics[i].mp_apicaddr; } else { ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d05b70c..9f1e5bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -176,7 +176,11 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) if (bad_ioapic(m->mpc_apicaddr)) return; - mp_ioapics[nr_ioapics] = *m; + mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr; + mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid; + mp_ioapics[nr_ioapics].mp_type = m->mpc_type; + mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver; + mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags; nr_ioapics++; } @@ -426,7 +430,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqflag = 0; /* conforming */ intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid; intsrc.mpc_irqtype = mp_INT; diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 2ef96f0..ade76c0 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -112,8 +112,16 @@ extern int nr_ioapic_registers[MAX_IO_APICS]; #define MP_MAX_IOAPIC_PIN 127 +struct mp_config_ioapic { + unsigned long mp_apicaddr; + unsigned int mp_apicid; + unsigned char mp_type; + unsigned char mp_apicver; + unsigned char mp_flags; +}; + /* I/O APIC entries */ -extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; +extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; /* # of MP IRQ source entries */ extern int mp_irq_entries; -- cgit v1.1 From 2fddb6e28e903a3ab1704cc5aac01be5a59dc05b Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Wed, 14 May 2008 19:03:17 +0400 Subject: x86: make config_irqsrc not MPspec specific Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 43 +++++++++++++++-------------- arch/x86/kernel/io_apic_32.c | 64 ++++++++++++++++++++++---------------------- arch/x86/kernel/io_apic_64.c | 58 +++++++++++++++++++-------------------- arch/x86/kernel/mpparse.c | 8 +++++- include/asm-x86/io_apic.h | 12 ++++++++- 5 files changed, 100 insertions(+), 85 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a26dd91..276ec05 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -969,14 +969,14 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mpc_irqflag = (trigger << 2) | polarity; - mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mpc_srcbusirq = bus_irq; /* IRQ */ - mp_irqs[mp_irq_entries].mpc_dstapic = + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity; + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq; /* IRQ */ + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ - mp_irqs[mp_irq_entries].mpc_dstirq = pin; /* INTIN# */ + mp_irqs[mp_irq_entries].mp_dstirq = pin; /* INTIN# */ if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); @@ -1012,12 +1012,11 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) return; - mp_irqs[mp_irq_entries].mpc_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mpc_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mpc_srcbus = MP_ISA_BUS; -#ifdef CONFIG_X86_IO_APIC - mp_irqs[mp_irq_entries].mpc_dstapic = mp_ioapics[ioapic].mp_apicid; -#endif + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; + /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. @@ -1026,17 +1025,17 @@ void __init mp_config_acpi_legacy_irqs(void) int idx; for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; + struct mp_config_intsrc *irq = mp_irqs + idx; /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS - && irq->mpc_srcbusirq == i) + if (irq->mp_srcbus == MP_ISA_BUS + && irq->mp_srcbusirq == i) break; /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == - mp_irqs[mp_irq_entries].mpc_dstapic) && - (irq->mpc_dstirq == i)) + if ((irq->mp_dstapic == + mp_irqs[mp_irq_entries].mp_dstapic) && + (irq->mp_dstirq == i)) break; } @@ -1045,9 +1044,9 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } - mp_irqs[mp_irq_entries].mpc_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mpc_srcbusirq = i; /* Identity mapped */ - mp_irqs[mp_irq_entries].mpc_dstirq = i; + mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; + mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ + mp_irqs[mp_irq_entries].mp_dstirq = i; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5af1b71..ea68c3e 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -76,7 +76,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; @@ -801,10 +801,10 @@ static int find_irq_entry(int apic, int pin, int type) int i; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) return i; return -1; @@ -818,13 +818,13 @@ static int __init find_isa_irq_pin(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) - return mp_irqs[i].mpc_dstirq; + return mp_irqs[i].mp_dstirq; } return -1; } @@ -834,17 +834,17 @@ static int __init find_isa_irq_apic(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } } @@ -869,23 +869,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; } for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) break; if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mpc_irqtype && + !mp_irqs[i].mp_irqtype && (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) return irq; /* * Use the first all-but-pin matching entry as a @@ -952,7 +952,7 @@ static int EISA_ELCR(unsigned int irq) * EISA conforming in the MP table, that means its trigger type must * be read in from the ELCR */ -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) /* PCI interrupts are always polarity one level triggered, @@ -969,13 +969,13 @@ static int EISA_ELCR(unsigned int irq) static int MPBIOS_polarity(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mpc_irqflag & 3) + switch (mp_irqs[idx].mp_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ { @@ -1012,13 +1012,13 @@ static int MPBIOS_polarity(int idx) static int MPBIOS_trigger(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ { @@ -1097,16 +1097,16 @@ static inline int irq_trigger(int idx) static int pin_2_irq(int idx, int apic, int pin) { int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; /* * Debugging check, we are in big trouble if this message pops up! */ - if (mp_irqs[idx].mpc_dstirq != pin) + if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) - irq = mp_irqs[idx].mpc_srcbusirq; + irq = mp_irqs[idx].mp_srcbusirq; else { /* * PCI IRQs are mapped in order @@ -1793,8 +1793,8 @@ static void __init setup_ioapic_ids_from_mpc(void) */ if (old_id != mp_ioapics[apic].mp_apicid) for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic = mp_ioapics[apic].mp_apicid; /* @@ -2810,8 +2810,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return -1; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == mp_INT && - mp_irqs[i].mpc_srcbusirq == bus_irq) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) break; if (i >= mp_irq_entries) return -1; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 4555ad8..e7f1476 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -108,7 +108,7 @@ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; /* MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; @@ -453,10 +453,10 @@ static int find_irq_entry(int apic, int pin, int type) int i; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) return i; return -1; @@ -470,13 +470,13 @@ static int __init find_isa_irq_pin(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) - return mp_irqs[i].mpc_dstirq; + return mp_irqs[i].mp_dstirq; } return -1; } @@ -486,17 +486,17 @@ static int __init find_isa_irq_apic(int irq, int type) int i; for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } } @@ -521,23 +521,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; } for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; + int lbus = mp_irqs[i].mp_srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) break; if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mpc_irqtype && + !mp_irqs[i].mp_irqtype && (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) return irq; /* * Use the first all-but-pin matching entry as a @@ -565,13 +565,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) static int MPBIOS_polarity(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mpc_irqflag & 3) + switch (mp_irqs[idx].mp_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ if (test_bit(bus, mp_bus_not_pci)) @@ -607,13 +607,13 @@ static int MPBIOS_polarity(int idx) static int MPBIOS_trigger(int idx) { - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ if (test_bit(bus, mp_bus_not_pci)) @@ -660,16 +660,16 @@ static inline int irq_trigger(int idx) static int pin_2_irq(int idx, int apic, int pin) { int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; + int bus = mp_irqs[idx].mp_srcbus; /* * Debugging check, we are in big trouble if this message pops up! */ - if (mp_irqs[idx].mpc_dstirq != pin) + if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mpc_srcbusirq; + irq = mp_irqs[idx].mp_srcbusirq; } else { /* * PCI IRQs are mapped in order @@ -2242,8 +2242,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return -1; for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == mp_INT && - mp_irqs[i].mpc_srcbusirq == bus_irq) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) break; if (i >= mp_irq_entries) return -1; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9f1e5bf..59f051d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -186,12 +186,18 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void __init MP_intsrc_info(struct mpc_config_intsrc *m) { - mp_irqs[mp_irq_entries] = *m; printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic; + mp_irqs[mp_irq_entries].mp_type = m->mpc_type; + mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype; + mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag; + mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus; + mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq; + mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index ade76c0..86d8c3b 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -120,6 +120,16 @@ struct mp_config_ioapic { unsigned char mp_flags; }; +struct mp_config_intsrc { + unsigned int mp_dstapic; + unsigned char mp_type; + unsigned char mp_irqtype; + unsigned short mp_irqflag; + unsigned char mp_srcbus; + unsigned char mp_srcbusirq; + unsigned char mp_dstirq; +}; + /* I/O APIC entries */ extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; @@ -127,7 +137,7 @@ extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; extern int mp_irq_entries; /* MP IRQ source entries */ -extern struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* non-0 if default (table-less) MP configuration */ extern int mpc_default_type; -- cgit v1.1 From 59f4519ad7ade61123e90085b0dadb2ea197bd87 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann <sandmann@daimi.au.dk> Date: Sun, 18 May 2008 05:24:41 +0200 Subject: x86: initialize all fields of mp_irqs[mp_irq_entries] Commit "x86: make config_irqsrc not MPspec specific" introduced some uses of uninitialized fields in mp_config_acpi_legacy_irqs(). I need the following patch to get sched-devel/master to boot. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/acpi/boot.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 276ec05..6170c6a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1012,11 +1012,6 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) return; - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; - /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. @@ -1044,6 +1039,10 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ mp_irqs[mp_irq_entries].mp_dstirq = i; -- cgit v1.1 From aafbdf71f1d3aeffd679b1a86e1b28f71515856c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Thu, 22 May 2008 12:26:15 +0400 Subject: x86: fix mpparse/acpi interaction Sitsofe Wheeler reported boot problems on linux-next. It looks like the same issue as found by Soeren Sandman in 7575217f656a93, "x86: initialize all fields of mp_irqs[mp_irq_entries]". But his fix is also not complete, as dstapic is used before it assigned. Reported-by: Sitsofe Wheeler <sitsofe@yahoo.com> Bisected-by: Sitsofe Wheeler <sitsofe@yahoo.com> Signed-off-by: Alexey Starikovskiy <astarikovskiy@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/acpi/boot.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6170c6a..2ddfaba 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1019,6 +1019,11 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; + for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; @@ -1039,10 +1044,6 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ mp_irqs[mp_irq_entries].mp_dstirq = i; -- cgit v1.1 From bf62f3981c7076714e3b9f5fa6989a806cad02bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 20 May 2008 20:10:58 -0700 Subject: x86: move e820_mark_nosave_regions to e820.c and make e820_mark_nosave_regions to take limit_pfn to use max_low_pfn for 32bit and end_pfn for 64bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/e820.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 32 -------------------------------- arch/x86/kernel/e820_64.c | 32 -------------------------------- arch/x86/kernel/setup_32.c | 2 +- arch/x86/kernel/setup_64.c | 2 +- include/asm-x86/e820.h | 9 +++++++++ include/asm-x86/e820_32.h | 8 -------- include/asm-x86/e820_64.h | 1 - 8 files changed, 43 insertions(+), 75 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 35da8cd..0cd9132 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/pfn.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -495,6 +496,37 @@ __init void e820_setup_gap(void) pci_mem_start, gapstart, gapsize); } +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +/** + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} +#endif /* * Early reserved memory areas. diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index dfe2575..db760d4 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/uaccess.h> -#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -208,37 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource, } } -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) -/** - * e820_mark_nosave_regions - Find the ranges of physical addresses that do not - * correspond to e820 RAM areas and mark the corresponding pages as nosave for - * hibernation. - * - * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. - */ -void __init e820_mark_nosave_regions(void) -{ - int i; - unsigned long pfn; - - pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); - for (i = 1; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (pfn < PFN_UP(ei->addr)) - register_nosave_region(pfn, PFN_UP(ei->addr)); - - pfn = PFN_DOWN(ei->addr + ei->size); - if (ei->type != E820_RAM) - register_nosave_region(PFN_UP(ei->addr), pfn); - - if (pfn >= max_low_pfn) - break; - } -} -#endif - /* * Find the highest page frame number we have available */ diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 3b2e0b1..5e063e7 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -17,7 +17,6 @@ #include <linux/kexec.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/suspend.h> #include <linux/pfn.h> #include <linux/pci.h> @@ -94,37 +93,6 @@ void __init e820_reserve_resources(void) } /* - * Find the ranges of physical addresses that do not correspond to - * e820 RAM areas and mark the corresponding pages as nosave for software - * suspend and suspend to RAM. - * - * This function requires the e820 map to be sorted and without any - * overlapping entries and assumes the first e820 area to be RAM. - */ -void __init e820_mark_nosave_regions(void) -{ - int i; - unsigned long paddr; - - paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); - for (i = 1; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (paddr < ei->addr) - register_nosave_region(PFN_DOWN(paddr), - PFN_UP(ei->addr)); - - paddr = round_down(ei->addr + ei->size, PAGE_SIZE); - if (ei->type != E820_RAM) - register_nosave_region(PFN_UP(ei->addr), - PFN_DOWN(paddr)); - - if (paddr >= (end_pfn << PAGE_SHIFT)) - break; - } -} - -/* * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the e820 entry. */ diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 3c451d1..e1173aec 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -820,7 +820,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_setup_gap(); - e820_mark_nosave_regions(); + e820_mark_nosave_regions(max_low_pfn); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff128..975a5da 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -516,7 +516,7 @@ void __init setup_arch(char **cmdline_p) * We trust e820 completely. No explicit ROM probing in memory. */ e820_reserve_resources(); - e820_mark_nosave_regions(); + e820_mark_nosave_regions(end_pfn); /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 5a58e2b..4266a2c 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -70,6 +70,15 @@ extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, extern void update_e820(void); extern void e820_setup_gap(void); +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +extern void e820_mark_nosave_regions(unsigned long limit_pfn); +#else +static inline void e820_mark_nosave_regions(unsigned long limit_pfn) +{ +} +#endif + extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 9576b43..7ace825 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -28,13 +28,5 @@ extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); -#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) -extern void e820_mark_nosave_regions(void); -#else -static inline void e820_mark_nosave_regions(void) -{ -} -#endif - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 37f176a..917963c 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -18,7 +18,6 @@ extern void setup_memory_region(void); extern void contig_e820_setup(void); extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); -extern void e820_mark_nosave_regions(void); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); -- cgit v1.1 From b3e2416465bc9140bfd209f247e6a789f68f0d19 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Fri, 23 May 2008 01:54:44 +0400 Subject: x86: Set pic_mode only if local apic code is present --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 59f051d..e61523a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -609,7 +609,7 @@ static void __init __get_smp_config(unsigned early) printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) if (mpf->mpf_feature2 & (1 << 7)) { printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); pic_mode = 1; -- cgit v1.1 From f3918352909f839a7b0dbf9b3f81d2e183c46f88 Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Fri, 23 May 2008 01:54:51 +0400 Subject: x86: move pic_mode to apic_32.c --- arch/x86/kernel/apic_32.c | 2 ++ arch/x86/kernel/mpparse.c | 2 -- arch/x86/mach-visws/mpparse.c | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index fa1be1d..848c603 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -76,6 +76,8 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); */ int apic_verbosity; +int pic_mode; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e61523a..b72c046 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -48,8 +48,6 @@ int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; static int mp_current_pci_id; -int pic_mode; - /* * Intel MP BIOS table parsing routines: */ diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index 57484e9..b7d063e 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -11,8 +11,6 @@ /* Have we found an MP table */ int smp_found_config; -int pic_mode; - extern unsigned int __cpuinitdata maxcpus; /* -- cgit v1.1 From bab4b27c00c4880737c18bb91138b1a7dd94164c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Mon, 19 May 2008 19:47:03 +0400 Subject: x86: move smp_found_config --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 3 +++ arch/x86/kernel/mpparse.c | 8 ++++---- arch/x86/mach-visws/mpparse.c | 5 ++--- arch/x86/mach-voyager/voyager_smp.c | 5 ----- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 848c603..c304759 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -78,6 +78,9 @@ int apic_verbosity; int pic_mode; +/* Have we found an MP table */ +int smp_found_config; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5910020..54087f9 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); */ int apic_verbosity; +/* Have we found an MP table */ +int smp_found_config; + static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b72c046..d67cd76 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -32,9 +32,6 @@ #include <mach_mpparse.h> #endif -/* Have we found an MP table */ -int smp_found_config; - /* * Various Linux-internal data structures created from the * MP-table. @@ -639,7 +636,9 @@ static void __init __get_smp_config(unsigned early) * override the defaults. */ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; +#endif printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " @@ -706,8 +705,9 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, !mpf_checksum((unsigned char *)bp, 16) && ((mpf->mpf_specification == 1) || (mpf->mpf_specification == 4))) { - +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; +#endif mpf_found = mpf; #ifdef CONFIG_X86_32 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index b7d063e..a2fb78c 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -8,9 +8,6 @@ #include "cobalt.h" #include "mach_apic.h" -/* Have we found an MP table */ -int smp_found_config; - extern unsigned int __cpuinitdata maxcpus; /* @@ -74,7 +71,9 @@ void __init find_smp_config(void) if (ncpus > maxcpus) ncpus = maxcpus; +#ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; +#endif while (ncpus--) MP_processor_info(mp++); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 7bbebbf..8dedd01 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -59,11 +59,6 @@ __u32 voyager_quad_processors = 0; * activity count. Finally exported by i386_ksyms.c */ static int voyager_extended_cpus = 1; -/* Have we found an SMP box - used by time.c to do the profiling - interrupt for timeslicing; do not set to 1 until the per CPU timer - interrupt is active */ -int smp_found_config = 0; - /* Used for the invalidate map that's also checked in the spinlock */ static volatile unsigned long smp_invalidate_needed; -- cgit v1.1 From ce6444d39fdea29dcf145d2d95fe9cdc850aa53c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Mon, 19 May 2008 19:47:09 +0400 Subject: x86: mp_bus_id_to_pci_bus is not needed --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/mpparse.c | 10 ---------- include/asm-x86/mpspec.h | 2 -- 4 files changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ea68c3e..17b2e8f 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -864,7 +864,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " "slot:%d, pin:%d.\n", bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { + if (test_bit(bus, mp_bus_not_pci)) { printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index e7f1476..9637675 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -516,7 +516,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { + if (test_bit(bus, mp_bus_not_pci)) { apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d67cd76..83dfd66 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -41,13 +41,6 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; - -static int mp_current_pci_id; - -/* - * Intel MP BIOS table parsing routines: - */ /* * Checksum an MP configuration block. @@ -101,7 +94,6 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) static void __init MP_bus_info(struct mpc_config_bus *m) { char str[7]; - memcpy(str, m->mpc_bustype, 6); str[6] = 0; @@ -130,8 +122,6 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mpc_oem_pci_bus(m, translation_table[mpc_record]); #endif clear_bit(m->mpc_busid, mp_bus_not_pci); - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 57a991b..b785ddd 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -32,8 +32,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern int mp_bus_id_to_pci_bus[MAX_MP_BUSSES]; - extern unsigned int boot_cpu_physical_apicid; extern int smp_found_config; extern int mpc_default_type; -- cgit v1.1 From 8732fc4b237fca3bd3cb0ec87ca8fb90271b0baf Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Mon, 19 May 2008 19:47:16 +0400 Subject: x86: move mp_bus_not_pci from mpparse.c --- arch/x86/kernel/io_apic_32.c | 6 ++++++ arch/x86/kernel/io_apic_64.c | 2 ++ arch/x86/kernel/mpparse.c | 10 ---------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 17b2e8f..2285b81 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -81,6 +81,12 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + static int disable_timer_pin_1 __initdata; /* diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9637675..339cf6f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -113,6 +113,8 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 83dfd66..e8e041e 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -33,16 +33,6 @@ #endif /* - * Various Linux-internal data structures created from the - * MP-table. - */ -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -/* * Checksum an MP configuration block. */ -- cgit v1.1 From 136ef671df04dc157afa0d4b96c7bd23ba072c9c Mon Sep 17 00:00:00 2001 From: Alexey Starikovskiy <astarikovskiy@suse.de> Date: Tue, 20 May 2008 00:29:59 +0400 Subject: x86: allow MPPARSE to be deselected in SMP configs --- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/setup_32.c | 2 +- arch/x86/kernel/setup_64.c | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31b..d5b364b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -127,7 +127,7 @@ config 4KSTACKS config X86_FIND_SMP_CONFIG def_bool y - depends on X86_LOCAL_APIC || X86_VOYAGER + depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS depends on X86_32 config X86_MPPARSE diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index e1173aec..c04e8c9 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -814,7 +814,7 @@ void __init setup_arch(char **cmdline_p) "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); #endif #endif -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) if (smp_found_config) get_smp_config(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 975a5da..89e6cca 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -456,10 +456,12 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_reserve_bootmem(); +#ifdef CONFIG_X86_MPPARSE /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); +#endif #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; @@ -502,11 +504,13 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); +#ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); +#endif init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.1 From b764a15f679942a7bc9d4f9645299e1defcc5b43 Mon Sep 17 00:00:00 2001 From: Alan Cox <alan@lxorguk.ukuu.org.uk> Date: Thu, 22 May 2008 21:22:04 +0100 Subject: x86: Switch apm to unlocked_kernel This pushes the lock a fair way down and the final kill looks like it should be an easy project for someone who wants to have a shot at it. Signed-off-by: Alan Cox <alan@redhat.com> Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/apm_32.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9290e..00e6d13 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,6 +228,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/smp_lock.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) as->event_tail = 0; } as->events[as->event_head] = event; - if ((!as->suser) || (!as->writer)) + if (!as->suser || !as->writer) continue; switch (event) { case APM_SYS_SUSPEND: @@ -1396,7 +1397,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { - if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { + if (as == NULL || as->magic != APM_BIOS_MAGIC) { printk(KERN_ERR "apm: %s passed bad filp\n", func); return 1; } @@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait) return 0; } -static int do_ioctl(struct inode *inode, struct file *filp, - u_int cmd, u_long arg) +static long do_ioctl(struct file *filp, u_int cmd, u_long arg) { struct apm_user *as; + int ret; as = filp->private_data; if (check_apm_user(as, "ioctl")) return -EIO; - if ((!as->suser) || (!as->writer)) + if (!as->suser || !as->writer) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: + lock_kernel(); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp, queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); + unlock_kernel(); break; case APM_IOC_SUSPEND: + lock_kernel(); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp, } else queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { - return suspend(1); + ret = suspend(1); } else { as->suspend_wait = 1; wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); - return as->suspend_result; + ret = as->suspend_result; } - break; + unlock_kernel(); + return ret; default: - return -EINVAL; + return -ENOTTY; } return 0; } @@ -1860,7 +1865,7 @@ static const struct file_operations apm_bios_fops = { .owner = THIS_MODULE, .read = do_read, .poll = do_poll, - .ioctl = do_ioctl, + .unlocked_ioctl = do_ioctl, .open = do_open, .release = do_release, }; -- cgit v1.1 From 2e4db9d2c5db9c9dd3e1d1ec3263bc4a990ff8df Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp> Date: Mon, 19 May 2008 14:23:44 -0700 Subject: x86: remove duplicate declaration of unknown_nmi_panic Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/nmi.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1e36302..8455bf3 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -46,7 +46,6 @@ extern void nmi_watchdog_default(void); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -extern int unknown_nmi_panic; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); -- cgit v1.1 From 85cc35fa7255d113b5383a9c8536c363274bb475 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sun, 25 May 2008 21:21:36 +0200 Subject: x86: fix mpparse fallout UP builds with LOCAL_APIC=y and IO_APIC=n fail with a missing reference to mp_bus_not_pci. Distangle the mpparse code some more and move the ioapic specific bus check into a separate function. This code needs sume urgent un#ifdef surgery all over the place. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/mpparse.c | 73 ++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e8e041e..9f3792d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -81,6 +81,7 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) generic_processor_info(apicid, m->mpc_apicver); } +#ifdef CONFIG_X86_IO_APIC static void __init MP_bus_info(struct mpc_config_bus *m) { char str[7]; @@ -122,6 +123,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } +#endif #ifdef CONFIG_X86_IO_APIC @@ -336,7 +338,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) { struct mpc_config_bus *m = (struct mpc_config_bus *)mpt; +#ifdef CONFIG_X86_IO_APIC MP_bus_info(m); +#endif mpt += sizeof(*m); count += sizeof(*m); break; @@ -472,40 +476,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) MP_intsrc_info(&intsrc); } -#endif -static inline void __init construct_default_ISA_mptable(int mpc_default_type) +static void construct_ioapic_table(int mpc_default_type) { - struct mpc_config_processor processor; - struct mpc_config_bus bus; -#ifdef CONFIG_X86_IO_APIC struct mpc_config_ioapic ioapic; -#endif - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } + struct mpc_config_bus bus; bus.mpc_type = MP_BUS; bus.mpc_busid = 0; @@ -534,7 +509,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) MP_bus_info(&bus); } -#ifdef CONFIG_X86_IO_APIC ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; @@ -546,7 +520,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * We set up most of the low 16 IO-APIC pins according to MPS rules. */ construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void construct_ioapic_table(int mpc_default_type) { } #endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + lintsrc.mpc_type = MP_LINTSRC; lintsrc.mpc_irqflag = 0; /* conforming */ lintsrc.mpc_srcbusid = 0; -- cgit v1.1 From ddca03c98a7f7ad5ab09967ff52d4ed60358c896 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:31 +0400 Subject: x86: nmi - unify die_nmi() interface By slightly changing 32bit mode die_nmi() we may unify the interface and make it common for both (32/64bit) modes Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 7 +++---- arch/x86/kernel/traps_32.c | 8 +++++--- include/asm-x86/nmi.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 69bdae5..bd04a28 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -320,8 +320,6 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); -extern void die_nmi(struct pt_regs *, const char *msg); - notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { @@ -375,7 +373,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, 0); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -406,7 +405,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(regs, buf); + die_nmi(buf, regs, 0); return 0; } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bde6f63..f31e665 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -755,9 +755,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_SPINLOCK(nmi_print_lock); -void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { - if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; spin_lock(&nmi_print_lock); @@ -766,10 +766,12 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) * to get a message out: */ bust_spinlocks(1); - printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 8455bf3..7cd5b6a 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,12 +38,12 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); -extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern void nmi_watchdog_default(void); #else #define nmi_watchdog_default() do {} while (0) #endif +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -- cgit v1.1 From e56b3a12c45e439169f2d1f7194be397667320a6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:32 +0400 Subject: x86: nmi - die_nmi() output message unification Make 64bit die_nmi() to produce the same message as 32bit mode has Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_64.c | 4 ++-- arch/x86/kernel/traps_64.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 33cb8ec..1a98705 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -358,8 +358,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, - panic_on_timeout); + die_nmi("NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76e..6a048ce 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -614,7 +614,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, smp_processor_id()); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); -- cgit v1.1 From c6425b9f143a75bbcd0a7684b4df40e20d0b2f32 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:33 +0400 Subject: x86: move do_nmi(), stop_nmi() and restart_nmi() to traps_64.c traps_32.c already holds these functions so do the same for traps_64.c Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_64.c | 25 ------------------------- arch/x86/kernel/traps_64.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 1a98705..c1ea671 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -30,7 +30,6 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -int panic_on_unrecovered_nmi; static cpumask_t backtrace_mask = CPU_MASK_NONE; @@ -383,30 +382,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) return rc; } -static unsigned ignore_nmis; - -asmlinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - add_pda(__nmi_count,1); - if (!ignore_nmis) - default_do_nmi(regs); - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 6a048ce..e4a3807 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -76,7 +76,9 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +int panic_on_unrecovered_nmi; static unsigned int code_bytes = 64; +static unsigned ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) { @@ -867,6 +869,28 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) io_check_error(reason, regs); } +asmlinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + add_pda(__nmi_count, 1); + if (!ignore_nmis) + default_do_nmi(regs); + nmi_exit(); +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + /* runs on IST stack. */ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) { -- cgit v1.1 From d1b946b97d71423f365fa797d1428e1847c0bec1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:34 +0400 Subject: x86: nmi_32.c - add "panic" option Allow to pass "panic" option in 32bit mode Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index bd04a28..4437fe1 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -42,6 +42,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; @@ -140,6 +141,14 @@ static int __init setup_nmi_watchdog(char *str) { int nmi; + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + get_option(&str, &nmi); if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) @@ -374,7 +383,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * die_nmi will return ONLY if NOTIFY_STOP happens.. */ die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, 0); + regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); -- cgit v1.1 From 4b82b277707a39b97271439c475f186f63ec4692 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:35 +0400 Subject: x86: nmi_32.c - add nmi_watchdog_default helper Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 19 +++++++++++++------ include/asm-x86/nmi.h | 4 +--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 4437fe1..7714e84 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,6 +51,17 @@ static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata = 0; +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +} + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -437,12 +448,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return -EIO; } - if (nmi_watchdog == NMI_DEFAULT) { - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; - } + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 7cd5b6a..1e8f34d 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,11 +38,9 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); -extern void nmi_watchdog_default(void); -#else -#define nmi_watchdog_default() do {} while (0) #endif +extern void nmi_watchdog_default(void); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -- cgit v1.1 From ad63ba169d45ccb6594eb35a6c31c421fe70ff45 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:36 +0400 Subject: x86: nmi_32/64.c - use apic_write_around instead of apic_write apic_write_around will be expanded to apic_write in 64bit mode anyway. Only a few CPUs (well, old CPUs to be precise) requires such an action. In general it should not hurt and could be cleaned up for apic_write (just in case) Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 7714e84..ec5842b 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -248,7 +248,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index c1ea671..f546e31 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -215,7 +215,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI); + apic_write_around(APIC_LVT0, APIC_DM_NMI); } /* @@ -229,7 +229,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* -- cgit v1.1 From 6c8decdf14b17d42f6eb817d310642f4d6598a19 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:37 +0400 Subject: x86: nmi_32.c - unknown_nmi_panic_callback should always panic Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index ec5842b..d0b0684 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -425,7 +425,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 0); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } -- cgit v1.1 From 96f9dcb10755e96eae706b9e869c0dc25adb3d74 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:38 +0400 Subject: x86: nmi_64.c - use for_each_possible_cpu helper Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index f546e31..d98b21c 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -98,7 +98,7 @@ int __init check_nmi_watchdog(void) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); #endif - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_cpu(cpu) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks -- cgit v1.1 From 7c2ba83f9a479eee6f302147767a30f3187fbd4b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:39 +0400 Subject: x86: nmi_32.c cleanup - use for_each_online_cpu helper Since cpu_online_map is touched (by for_each_online_cpu) at moment when cpu_callin_map is already filled up we can get rid of its checking at all Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index d0b0684..c6d0777 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -108,13 +108,7 @@ int __init check_nmi_watchdog(void) local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks - for_each_possible_cpu(cpu) { -#ifdef CONFIG_SMP - /* Check cpu_callin_map here because that is set - after the timer is started. */ - if (!cpu_isset(cpu, cpu_callin_map)) - continue; -#endif + for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { -- cgit v1.1 From fd5cea02de100197a4c26d9e103508cf09b50a82 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:40 +0400 Subject: x86: nmi_32/64.c - add helper functions to hide arch specific data Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_32.c | 44 ++++++++++++++++++++++++++++++++------------ arch/x86/kernel/nmi_64.c | 42 +++++++++++++++++++++++++++++++----------- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index c6d0777..a94dbed 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,6 +51,26 @@ static DEFINE_PER_CPU(short, wd_enabled); static int endflag __initdata = 0; +static inline unsigned int get_nmi_count(int cpu) +{ + return nmi_count(cpu); +} + +static inline int mce_in_progress(void) +{ + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -104,19 +124,19 @@ int __init check_nmi_watchdog(void) #endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = nmi_count(cpu); + prev_nmi_count[cpu] = get_nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], - nmi_count(cpu)); + get_nmi_count(cpu)); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } @@ -355,6 +375,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; } + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + if (cpu_isset(cpu, backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ @@ -365,16 +392,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } - /* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) touched = 1; - } /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index d98b21c..c680244 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -47,6 +47,30 @@ static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); +static int endflag __initdata = 0; + +static inline unsigned int get_nmi_count(int cpu) +{ + return cpu_pda(cpu)->__nmi_count; +} + +static inline int mce_in_progress(void) +{ +#ifdef CONFIG_X86_MCE + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +} + /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -55,8 +79,6 @@ void nmi_watchdog_default(void) nmi_watchdog = NMI_NONE; } -static int endflag __initdata = 0; - #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -99,19 +121,19 @@ int __init check_nmi_watchdog(void) #endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; + prev_nmi_count[cpu] = get_nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], - cpu_pda(cpu)->__nmi_count); + get_nmi_count(cpu)); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } @@ -327,7 +349,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; } - sum = read_pda(apic_timer_irqs) + read_pda(irq0_irqs); + sum = get_timer_irqs(cpu); + if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -343,12 +366,9 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) cpu_clear(cpu, backtrace_mask); } -#ifdef CONFIG_X86_MCE - /* Could check oops_in_progress here too, but it's safer - not too */ - if (atomic_read(&mce_entry) > 0) + if (mce_in_progress()) touched = 1; -#endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* -- cgit v1.1 From 1798bc22b2790bf2a956588e6b17c36ef79ceff7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 24 May 2008 19:36:41 +0400 Subject: x86: nmi_32/64.c - merge down nmi_32.c and nmi_64.c to nmi.c Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: hpa@zytor.com Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/nmi.c | 533 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/nmi_32.c | 506 -------------------------------------------- arch/x86/kernel/nmi_64.c | 477 ------------------------------------------ 4 files changed, 534 insertions(+), 984 deletions(-) create mode 100644 arch/x86/kernel/nmi.c delete mode 100644 arch/x86/kernel/nmi_32.c delete mode 100644 arch/x86/kernel/nmi_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..d5dd666 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 0000000..69a839f --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,533 @@ +/* + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar <mingo@redhat.com> + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include <linux/nmi.h> +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/sysctl.h> +#include <linux/percpu.h> +#include <linux/kprobes.h> +#include <linux/cpumask.h> +#include <linux/kernel_stat.h> +#include <linux/kdebug.h> + +#include <asm/smp.h> +#include <asm/nmi.h> +#include <asm/proto.h> +#include <asm/timer.h> + +#include <asm/mce.h> + +#include <mach_traps.h> + +int unknown_nmi_panic; +int nmi_watchdog_enabled; + +static cpumask_t backtrace_mask = CPU_MASK_NONE; + +/* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +static int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; + +static unsigned int nmi_hz = HZ; +static DEFINE_PER_CPU(short, wd_enabled); +static int endflag __initdata = 0; + +static inline unsigned int get_nmi_count(int cpu) +{ +#ifdef CONFIG_X86_64 + return cpu_pda(cpu)->__nmi_count; +#else + return nmi_count(cpu); +#endif +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ +#ifdef CONFIG_X86_64 + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +#else + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +#endif +} + +/* Run after command line and cpu_init init, but before all other checks */ +void nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; +#ifdef CONFIG_X86_64 + nmi_watchdog = NMI_NONE; +#else + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +#endif +} + +#ifdef CONFIG_SMP +/* + * The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) +{ + local_irq_enable_in_hardirq(); + /* + * Intentionally don't use cpu_relax here. This is + * to make sure that the performance counter really ticks, + * even if there is a simulator or similar that catches the + * pause instruction. On a real HT machine this is fine because + * all other CPUs are busy with "useless" delay loops and don't + * care if they get somewhat less cycles. + */ + while (endflag == 0) + mb(); +} +#endif + +int __init check_nmi_watchdog(void) +{ + unsigned int *prev_nmi_count; + int cpu; + + if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) + goto error; + + printk(KERN_INFO "Testing NMI watchdog ... "); + +#ifdef CONFIG_SMP + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif + + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = get_nmi_count(cpu); + local_irq_enable(); + mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ + + for_each_online_cpu(cpu) { + if (!per_cpu(wd_enabled, cpu)) + continue; + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk(KERN_WARNING "WARNING: CPU#%d: NMI " + "appears to be stuck (%d->%d)!\n", + cpu, + prev_nmi_count[cpu], + get_nmi_count(cpu)); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + endflag = 1; + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + goto error; + } + printk("OK.\n"); + + /* + * now that we know it works we can reduce NMI frequency to + * something more reasonable; makes a difference in some configs + */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(prev_nmi_count); + return 0; + +error: +#ifdef CONFIG_X86_32 + timer_ack = !cpu_has_tsc; +#endif + return -1; +} + +static int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID || nmi < NMI_NONE) + return 0; + + nmi_watchdog = nmi; + return 1; +} +__setup("nmi_watchdog=", setup_nmi_watchdog); + +/* + * Suspend/resume support + */ +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; +} + +static struct sysdev_class nmi_sysclass = { + .name = "lapic_nmi", + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + /* + * should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if (atomic_read(&nmi_active) < 0) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} + +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + if (__get_cpu_var(wd_enabled)) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + /* enable it before to avoid race with handler */ + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } +} + +void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if (nmi_watchdog != NMI_LOCAL_APIC && + nmi_watchdog != NMI_IO_APIC) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up here too!] + */ + +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); + +void touch_nmi_watchdog(void) +{ + if (nmi_watchdog > 0) { + unsigned cpu; + + /* + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. + */ + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } + } + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +{ + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + unsigned int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc = 0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + touched = 1; + + /* if the none of the timers isn't firing, this cpu isn't doing much */ + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } else { + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); + } + + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* + * don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ + return 0; +} + +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + printk(KERN_WARNING + "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ + +int do_nmi_callback(struct pt_regs *regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); + diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c deleted file mode 100644 index a94dbed..0000000 --- a/arch/x86/kernel/nmi_32.c +++ /dev/null @@ -1,506 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kernel_stat.h> -#include <linux/kdebug.h> -#include <linux/slab.h> - -#include <asm/smp.h> -#include <asm/nmi.h> -#include <asm/timer.h> - -#include "mach_traps.h" - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return nmi_count(cpu); -} - -static inline int mce_in_progress(void) -{ - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - - -/* Suspend/resume support */ - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up - * here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c deleted file mode 100644 index c680244..0000000 --- a/arch/x86/kernel/nmi_64.c +++ /dev/null @@ -1,477 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kdebug.h> - -#include <asm/smp.h> -#include <asm/nmi.h> -#include <asm/proto.h> -#include <asm/mce.h> - -#include <mach_traps.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -static cpumask_t backtrace_mask = CPU_MASK_NONE; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; - -static DEFINE_PER_CPU(short, wd_enabled); - -static int endflag __initdata = 0; - -static inline unsigned int get_nmi_count(int cpu) -{ - return cpu_pda(cpu)->__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#ifdef CONFIG_X86_MCE - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); -} - -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - nmi_watchdog = NMI_NONE; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (endflag == 0) - mb(); -} -#endif - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) - return 0; - - if (!atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - return -1; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20*1000)/nmi_hz); // wait 20 ticks - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - return -1; - } - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -} - -static int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str,"panic",5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) - return 0; - - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - __get_cpu_var(wd_enabled) = 1; - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog > 0) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - if (cpu_isset(cpu, backtrace_mask)) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); - dump_stack(); - spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); - } - - if (mce_in_progress()) - touched = 1; - - /* if the apic timer isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { - printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else { - printk( KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void __trigger_all_cpu_backtrace(void) -{ - int i; - - backtrace_mask = cpu_online_map; - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) - break; - mdelay(1); - } -} - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); -- cgit v1.1 From 4b6011bc6ec71660859139ac8d28b7d0badd681c Mon Sep 17 00:00:00 2001 From: Sven Wegener <sven.wegener@stealer.net> Date: Sun, 25 May 2008 21:16:36 +0200 Subject: x86: Remove obsolete LOCK macro from include/asm-x86/atomic_64.h Commit d167a518 "[PATCH] x86_64: x86_64 version of the smp alternative patch." has left the LOCK macro in include/asm-x86_64/atomic.h, which is now include/asm-x86/atomic_64.h. Its scope should be local to the file, other architectures don't provide it, I couldn't find an in-tree user of it and allyesconfig, allmodconfig and allnoconfig build fine without it, so this patch removes it. Signed-off-by: Sven Wegener <sven.wegener@stealer.net> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/atomic_64.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/asm-x86/atomic_64.h b/include/asm-x86/atomic_64.h index 3e0cd7d..fe589c1 100644 --- a/include/asm-x86/atomic_64.h +++ b/include/asm-x86/atomic_64.h @@ -11,12 +11,6 @@ * resource counting etc.. */ -#ifdef CONFIG_SMP -#define LOCK "lock ; " -#else -#define LOCK "" -#endif - /* * Make sure gcc doesn't try to be clever and move things around * on us. We need to use _exactly_ the address the user gave us, -- cgit v1.1 From 1a20d3ecf5c2a6435df2b756435b1eb1c657d45b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Mon, 26 May 2008 13:36:53 -0700 Subject: x86: string_32.h: workaround for broken gcc 4.0 gcc 4.0 fails to allocate %eax for the pattern operand in the rep store instructions used by memset; force it to do so by declaring a register variable. Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- include/asm-x86/string_32.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/string_32.h b/include/asm-x86/string_32.h index 8d0c593..193578c 100644 --- a/include/asm-x86/string_32.h +++ b/include/asm-x86/string_32.h @@ -267,11 +267,18 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, asm volatile("rep ; stosl" \ x \ : "=&c" (d0), "=&D" (d1) \ - : "a" (pattern), "0" (count/4), "1" ((long)s) \ + : "a" (eax), "0" (count/4), "1" ((long)s) \ : "memory") { int d0, d1; +#if __GNUC__ == 4 && __GNUC_MINOR__ == 0 + /* Workaround for broken gcc 4.0 */ + register unsigned long eax asm("%eax") = pattern; +#else + unsigned long eax = pattern; +#endif + switch (count % 4) { case 0: COMMON(""); -- cgit v1.1 From 4226ab93d8ae3fd895abe45879fe34d489a98718 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:30:58 +0100 Subject: x86: use pteval_t for _PAGE_FOO Rather than making _PAGE_* constants signed, and then relying on sign-extension to make sure that masks derived from them are wide enough, just explicitly type them pteval_t. This guarantees that they and any derived values are the right size for the current pte format. The reliance on sign extension is fragile, and invokes some very subtle corners of the C type system. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/pgtable.h | 49 +++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b..fab3ecb 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -20,30 +20,25 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a - * sign-extended value on 32-bit with all 1's in the upper word, - * which preserves the upper pte values on 64-bit ptes: - */ -#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT) -#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW) -#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER) -#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT) -#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD) -#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED) -#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY) -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */ -#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */ -#define _PAGE_UNUSED1 (_AC(1, L)<<_PAGE_BIT_UNUSED1) -#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2) -#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3) -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT) -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE) +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_UNUSED2 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED2) +#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else -#define _PAGE_NX 0 +#define _PAGE_NX (_AT(pteval_t, 0)) #endif /* If _PAGE_PRESENT is clear, we use these: */ @@ -210,22 +205,22 @@ static inline int pmd_large(pmd_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); + return __pte(pte_val(pte) & ~_PAGE_DIRTY); } static inline pte_t pte_mkold(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } static inline pte_t pte_wrprotect(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_RW); + return __pte(pte_val(pte) & ~_PAGE_RW); } static inline pte_t pte_mkexec(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_NX); + return __pte(pte_val(pte) & ~_PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) @@ -250,7 +245,7 @@ static inline pte_t pte_mkhuge(pte_t pte) static inline pte_t pte_clrhuge(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_PSE); + return __pte(pte_val(pte) & ~_PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) @@ -260,7 +255,7 @@ static inline pte_t pte_mkglobal(pte_t pte) static inline pte_t pte_clrglobal(pte_t pte) { - return __pte(pte_val(pte) & ~(pteval_t)_PAGE_GLOBAL); + return __pte(pte_val(pte) & ~_PAGE_GLOBAL); } static inline pte_t pte_mkspecial(pte_t pte) -- cgit v1.1 From 4e09e21ccb0dfe7ee8d5641192e0072e83bd916b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:03 +0100 Subject: x86: use symbolic constant in stts() Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/asm-x86/system.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index a2f04cd..7e4c133 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -289,7 +289,7 @@ static inline void native_wbinvd(void) #endif/* CONFIG_PARAVIRT */ -#define stts() write_cr0(8 | read_cr0()) +#define stts() write_cr0(read_cr0() | X86_CR0_TS) #endif /* __KERNEL__ */ -- cgit v1.1 From 0acf10d8fbd52926217d3933d196b33fe2468f18 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:30:59 +0100 Subject: xen: add raw console write functions for debug Add a couple of functions which can write directly to the Xen console for debugging. This output ends up on the host's dom0 console (assuming it allows the domain to write there). Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/char/hvc_xen.c | 26 ++++++++++++++++++++++++++ include/xen/hvc-console.h | 3 +++ 2 files changed, 29 insertions(+) diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index dd68f85..e97d9d1 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -157,3 +157,29 @@ struct console xenboot_console = { .write = xenboot_write_console, .flags = CON_PRINTBUFFER | CON_BOOT, }; + +void xen_raw_console_write(const char *str) +{ + int len = strlen(str); + + while(len > 0) { + int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); + if (rc <= 0) + break; + + str += rc; + len -= rc; + } +} + +void xen_raw_printk(const char *fmt, ...) +{ + static char buf[512]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + xen_raw_console_write(buf); +} diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index 21c0ecf..efc3237 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -3,4 +3,7 @@ extern struct console xenboot_console; +void xen_raw_console_write(const char *str); +void xen_raw_printk(const char *fmt, ...); + #endif /* XEN_HVC_CONSOLE_H */ -- cgit v1.1 From 0922abdc3982ae54cbe1b24ac5aa91a260eca1bb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:00 +0100 Subject: xen: make early console also write to debug console When using "earlyprintk=xen", also write the console output to the raw debug console. This will appear on dom0's console if the hypervisor has been compiled to allow it. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/char/hvc_xen.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index e97d9d1..2413af3 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -134,12 +134,27 @@ module_init(xen_init); module_exit(xen_fini); console_initcall(xen_cons_init); +static void raw_console_write(const char *str, int len) +{ + while(len > 0) { + int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); + if (rc <= 0) + break; + + str += rc; + len -= rc; + } +} + +#ifdef CONFIG_EARLY_PRINTK static void xenboot_write_console(struct console *console, const char *string, unsigned len) { unsigned int linelen, off = 0; const char *pos; + raw_console_write(string, len); + while (off < len && NULL != (pos = strchr(string+off, '\n'))) { linelen = pos-string+off; if (off + linelen > len) @@ -155,21 +170,13 @@ static void xenboot_write_console(struct console *console, const char *string, struct console xenboot_console = { .name = "xenboot", .write = xenboot_write_console, - .flags = CON_PRINTBUFFER | CON_BOOT, + .flags = CON_PRINTBUFFER | CON_BOOT | CON_ANYTIME, }; +#endif /* CONFIG_EARLY_PRINTK */ void xen_raw_console_write(const char *str) { - int len = strlen(str); - - while(len > 0) { - int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); - if (rc <= 0) - break; - - str += rc; - len -= rc; - } + raw_console_write(str, strlen(str)); } void xen_raw_printk(const char *fmt, ...) -- cgit v1.1 From 7b1333aa4cb546ddeb9c05098a53d9a777623a05 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:01 +0100 Subject: xen: use hypercall rather than clts Xen will trap and emulate clts, but its better to use a hypercall. Also, xenner doesn't handle clts. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 28 ++++++++++++++++++++++++++-- include/asm-x86/xen/hypercall.h | 7 +++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a05b772..446f4cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -607,6 +607,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, xen_mc_issue(PARAVIRT_LAZY_MMU); } +static void xen_clts(void) +{ + struct multicall_space mcs; + + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_write_cr0(unsigned long cr0) +{ + struct multicall_space mcs; + + /* Only pay attention to cr0.TS; everything else is + ignored. */ + mcs = xen_mc_entry(0); + + MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + static void xen_write_cr2(unsigned long cr2) { x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; @@ -978,10 +1002,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = native_clts, + .clts = xen_clts, .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, + .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h index c2ccd99..897ff79 100644 --- a/include/asm-x86/xen/hypercall.h +++ b/include/asm-x86/xen/hypercall.h @@ -315,6 +315,13 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) } static inline void +MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) +{ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = set; +} + +static inline void MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, pte_t new_val, unsigned long flags) { -- cgit v1.1 From 349c709f42453707f74bece0d9d35ee5b3842893 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:02 +0100 Subject: xen: use new sched_op Use the new sched_op hypercall, mainly because xenner doesn't support the old one. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 6 ++++-- include/asm-x86/xen/hypercall.h | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 446f4cd..35ddaf5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -254,7 +254,7 @@ static void xen_irq_enable(void) static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) BUG(); } @@ -1138,11 +1138,13 @@ static const struct smp_ops xen_smp_ops __initdata = { static void xen_reboot(int reason) { + struct sched_shutdown r = { .reason = reason }; + #ifdef CONFIG_SMP smp_send_stop(); #endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h index 897ff79..2a4f9b4 100644 --- a/include/asm-x86/xen/hypercall.h +++ b/include/asm-x86/xen/hypercall.h @@ -176,9 +176,9 @@ HYPERVISOR_fpu_taskswitch(int set) } static inline int -HYPERVISOR_sched_op(int cmd, unsigned long arg) +HYPERVISOR_sched_op(int cmd, void *arg) { - return _hypercall2(int, sched_op, cmd, arg); + return _hypercall2(int, sched_op_new, cmd, arg); } static inline long -- cgit v1.1 From 2956a3511c8c5dccb1d4739ead17c7c3c23a24b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:04 +0100 Subject: xen: allow some cr4 updates The guest can legitimately change things like cr4.OSFXSR and OSXMMEXCPT, so let it. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 35ddaf5..f687648 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -648,8 +648,10 @@ static unsigned long xen_read_cr2_direct(void) static void xen_write_cr4(unsigned long cr4) { - /* Just ignore cr4 changes; Xen doesn't allow us to do - anything anyway. */ + cr4 &= ~X86_CR4_PGE; + cr4 &= ~X86_CR4_PSE; + + native_write_cr4(cr4); } static unsigned long xen_read_cr3(void) -- cgit v1.1 From 239d1fc04ed0b58d638096b12a7f6d50269d30c9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:05 +0100 Subject: xen: don't worry about preempt during xen_irq_enable() When enabling interrupts, we don't need to worry about preemption, because we either enter with interrupts disabled - so no preemption - or the caller is confused and is re-enabling interrupts on some indeterminate processor. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f687648..4a372b7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -235,13 +235,13 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + vcpu = x86_read_percpu(xen_vcpu); vcpu->evtchn_upcall_mask = 0; - preempt_enable_no_resched(); /* Doesn't matter if we get preempted here, because any pending event will get dealt with anyway. */ -- cgit v1.1 From a15af1c9ea2750a9ff01e51615c45950bad8221b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:06 +0100 Subject: x86/paravirt: add pte_flags to just get pte flags Add pte_flags() to extract the flags from a pte. This is a special case of pte_val() which is only guaranteed to return the pte's flags correctly; the page number may be corrupted or missing. The intent is to allow paravirt implementations to return pte flags without having to do any translation of the page number (most notably, Xen). Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/paravirt.c | 1 + arch/x86/xen/enlighten.c | 1 + drivers/lguest/lg.h | 1 - include/asm-x86/page.h | 1 + include/asm-x86/paravirt.h | 15 +++++++++++++++ include/asm-x86/pgtable.h | 16 ++++++++-------- 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 74f0c5e..c98d546 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -403,6 +403,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, + .pte_flags = native_pte_val, .pgd_val = native_pgd_val, .make_pte = native_make_pte, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4a372b7..1b4b5fa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1101,6 +1101,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pmd = xen_set_pmd, .pte_val = xen_pte_val, + .pte_flags = native_pte_val, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 005bd04..5faefea 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -136,7 +136,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * first step in the migration to the kernel types. pte_pfn is already defined * in the kernel. */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) -#define pte_flags(x) (pte_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index dc936dd..a1e2b94 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -160,6 +160,7 @@ static inline pteval_t native_pte_val(pte_t pte) #endif #define pte_val(x) native_pte_val(x) +#define pte_flags(x) native_pte_val(x) #define __pte(x) native_make_pte(x) #endif /* CONFIG_PARAVIRT */ diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 0f13b94..5ea37a4 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -239,6 +239,7 @@ struct pv_mmu_ops { unsigned long addr, pte_t *ptep); pteval_t (*pte_val)(pte_t); + pteval_t (*pte_flags)(pte_t); pte_t (*make_pte)(pteval_t pte); pgdval_t (*pgd_val)(pgd_t); @@ -996,6 +997,20 @@ static inline pteval_t pte_val(pte_t pte) return ret; } +static inline pteval_t pte_flags(pte_t pte) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags, + pte.pte, (u64)pte.pte >> 32); + else + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, + pte.pte); + + return ret; +} + static inline pgd_t __pgd(pgdval_t val) { pgdval_t ret; diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b..47a852c 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -164,37 +164,37 @@ extern struct list_head pgd_list; */ static inline int pte_dirty(pte_t pte) { - return pte_val(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { - return pte_val(pte) & _PAGE_ACCESSED; + return pte_flags(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { - return pte_val(pte) & _PAGE_RW; + return pte_flags(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { - return pte_val(pte) & _PAGE_FILE; + return pte_flags(pte) & _PAGE_FILE; } static inline int pte_huge(pte_t pte) { - return pte_val(pte) & _PAGE_PSE; + return pte_flags(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) { - return pte_val(pte) & _PAGE_GLOBAL; + return pte_flags(pte) & _PAGE_GLOBAL; } static inline int pte_exec(pte_t pte) { - return !(pte_val(pte) & _PAGE_NX); + return !(pte_flags(pte) & _PAGE_NX); } static inline int pte_special(pte_t pte) @@ -305,7 +305,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_val(x) & ~PTE_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x) & ~PTE_MASK) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) -- cgit v1.1 From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001 From: Markus Armbruster <armbru@redhat.com> Date: Mon, 26 May 2008 23:31:07 +0100 Subject: xen: Enable console tty by default in domU if it's not a dummy Without console= arguments on the kernel command line, the first console to register becomes enabled and the preferred console (the one behind /dev/console). This is normally tty (assuming CONFIG_VT_CONSOLE is enabled, which it commonly is). This is okay as long tty is a useful console. But unless we have the PV framebuffer, and it is enabled for this domain, tty0 in domU is merely a dummy. In that case, we want the preferred console to be the Xen console hvc0, and we want it without having to fiddle with the kernel command line. Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7 did that for us. Since we now have the PV framebuffer, we want to enable and prefer tty again, but only when PVFB is enabled. But even then we still want to enable the Xen console as well. Problem: when tty registers, we can't yet know whether the PVFB is enabled. By the time we can know (xenstore is up), the console setup game is over. Solution: enable console tty by default, but keep hvc as the preferred console. Change the preferred console to tty when PVFB probes successfully, unless we've been given console kernel parameters. Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 4 +++- drivers/video/xen-fbfront.c | 25 +++++++++++++++++++++++++ include/linux/console.h | 2 ++ kernel/printk.c | 3 +++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1b4b5fa..6cfb708 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1256,8 +1256,10 @@ asmlinkage void __init xen_start_kernel(void) ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - if (!is_initial_xendomain()) + if (!is_initial_xendomain()) { + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } /* Start the world */ start_kernel(); diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 619a6f8..4e10876 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -18,6 +18,7 @@ * frame buffer. */ +#include <linux/console.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fb.h> @@ -48,6 +49,7 @@ struct xenfb_info { static u32 xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8; +static void xenfb_make_preferred_console(void); static int xenfb_remove(struct xenbus_device *); static void xenfb_init_shared_page(struct xenfb_info *); static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *); @@ -348,6 +350,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, if (ret < 0) goto error; + xenfb_make_preferred_console(); return 0; error_nomem: @@ -358,6 +361,28 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, return ret; } +static __devinit void +xenfb_make_preferred_console(void) +{ + struct console *c; + + if (console_set_on_cmdline) + return; + + acquire_console_sem(); + for (c = console_drivers; c; c = c->next) { + if (!strcmp(c->name, "tty") && c->index == 0) + break; + } + release_console_sem(); + if (c) { + unregister_console(c); + c->flags |= CON_CONSDEV; + c->flags &= ~CON_PRINTBUFFER; /* don't print again */ + register_console(c); + } +} + static int xenfb_resume(struct xenbus_device *dev) { struct xenfb_info *info = dev->dev.driver_data; diff --git a/include/linux/console.h b/include/linux/console.h index a4f27fb..248e6e3 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -108,6 +108,8 @@ struct console { struct console *next; }; +extern int console_set_on_cmdline; + extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); extern void register_console(struct console *); diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c3..028ed75 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -121,6 +121,8 @@ struct console_cmdline static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int selected_console = -1; static int preferred_console = -1; +int console_set_on_cmdline; +EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -890,6 +892,7 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); -- cgit v1.1 From 6ba0e7b36c7cc1745b3cbeda244d14edae3ad058 Mon Sep 17 00:00:00 2001 From: Markus Armbruster <armbru@redhat.com> Date: Mon, 26 May 2008 23:31:08 +0100 Subject: xen pvfb: Pointer z-axis (mouse wheel) support Add z-axis motion to pointer events. Backward compatible, because there's space for the z-axis in union xenkbd_in_event, and old backends zero it. Derived from http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/57dfe0098000 http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/1edfea26a2a9 http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/c3ff0b26f664 Signed-off-by: Pat Campbell <plc@novell.com> Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/input/xen-kbdfront.c | 8 +++++++- include/xen/interface/io/kbdif.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c index 0f47f46..9da4452 100644 --- a/drivers/input/xen-kbdfront.c +++ b/drivers/input/xen-kbdfront.c @@ -66,6 +66,9 @@ static irqreturn_t input_handler(int rq, void *dev_id) case XENKBD_TYPE_MOTION: input_report_rel(dev, REL_X, event->motion.rel_x); input_report_rel(dev, REL_Y, event->motion.rel_y); + if (event->motion.rel_z) + input_report_rel(dev, REL_WHEEL, + -event->motion.rel_z); break; case XENKBD_TYPE_KEY: dev = NULL; @@ -84,6 +87,9 @@ static irqreturn_t input_handler(int rq, void *dev_id) case XENKBD_TYPE_POS: input_report_abs(dev, ABS_X, event->pos.abs_x); input_report_abs(dev, ABS_Y, event->pos.abs_y); + if (event->pos.rel_z) + input_report_rel(dev, REL_WHEEL, + -event->pos.rel_z); break; } if (dev) @@ -152,7 +158,7 @@ static int __devinit xenkbd_probe(struct xenbus_device *dev, ptr->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS); for (i = BTN_LEFT; i <= BTN_TASK; i++) set_bit(i, ptr->keybit); - ptr->relbit[0] = BIT(REL_X) | BIT(REL_Y); + ptr->relbit[0] = BIT(REL_X) | BIT(REL_Y) | BIT(REL_WHEEL); input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0); input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0); diff --git a/include/xen/interface/io/kbdif.h b/include/xen/interface/io/kbdif.h index fb97f42..8066c78 100644 --- a/include/xen/interface/io/kbdif.h +++ b/include/xen/interface/io/kbdif.h @@ -49,6 +49,7 @@ struct xenkbd_motion { uint8_t type; /* XENKBD_TYPE_MOTION */ int32_t rel_x; /* relative X motion */ int32_t rel_y; /* relative Y motion */ + int32_t rel_z; /* relative Z motion (wheel) */ }; struct xenkbd_key { @@ -61,6 +62,7 @@ struct xenkbd_position { uint8_t type; /* XENKBD_TYPE_POS */ int32_t abs_x; /* absolute X position (in FB pixels) */ int32_t abs_y; /* absolute Y position (in FB pixels) */ + int32_t rel_z; /* relative Z motion (wheel) */ }; #define XENKBD_IN_EVENT_SIZE 40 -- cgit v1.1 From 1e892c959da42278e60b21f5ecfd6fba0efff313 Mon Sep 17 00:00:00 2001 From: Markus Armbruster <armbru@redhat.com> Date: Mon, 26 May 2008 23:31:09 +0100 Subject: xen pvfb: Module aliases to support module autoloading These are mostly for completeness and consistency with the other frontends, as PVFB is typically compiled in rather than a module. Derived from http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/5e294e29a43e While there, add module descriptions. Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/input/xen-kbdfront.c | 2 ++ drivers/video/xen-fbfront.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c index 9da4452..eaf69cf 100644 --- a/drivers/input/xen-kbdfront.c +++ b/drivers/input/xen-kbdfront.c @@ -343,4 +343,6 @@ static void __exit xenkbd_cleanup(void) module_init(xenkbd_init); module_exit(xenkbd_cleanup); +MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("xen:vkbd"); diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 4e10876..5553e51 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -572,4 +572,6 @@ static void __exit xenfb_cleanup(void) module_init(xenfb_init); module_exit(xenfb_cleanup); +MODULE_DESCRIPTION("Xen virtual framebuffer device frontend"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("xen:vfb"); -- cgit v1.1 From f4ad1ebd7a0fae2782ef9f76c0b94b536742c3e8 Mon Sep 17 00:00:00 2001 From: Markus Armbruster <armbru@redhat.com> Date: Mon, 26 May 2008 23:31:10 +0100 Subject: xen pvfb: Zero unused bytes in events sent to backend This isn't a security flaw (the backend can see all our memory anyway). But it's the right thing to do all the same. Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/video/xen-fbfront.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 5553e51..291eef6 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -61,6 +61,7 @@ static void xenfb_do_update(struct xenfb_info *info, union xenfb_out_event event; u32 prod; + memset(&event, 0, sizeof(event)); event.type = XENFB_TYPE_UPDATE; event.update.x = x; event.update.y = y; -- cgit v1.1 From e4dcff1f6e7582f76c2c9990b1d9111bbc8e26ef Mon Sep 17 00:00:00 2001 From: Markus Armbruster <armbru@redhat.com> Date: Mon, 26 May 2008 23:31:11 +0100 Subject: xen pvfb: Dynamic mode support (screen resizing) The pvfb backend indicates dynamic mode support by creating node feature_resize with a non-zero value in its xenstore directory. xen-fbfront sends a resize notification event on mode change. Fully backwards compatible both ways. Framebuffer size and initial resolution can be controlled through kernel parameter xen_fbfront.video. The backend enforces a separate size limit, which it advertises in node videoram in its xenstore directory. xen-kbdfront gets the maximum screen resolution from nodes width and height in the backend's xenstore directory instead of hardcoding it. Additional goodie: support for larger framebuffers (512M on a 64-bit system with 4K pages). Changing the number of bits per pixels dynamically is not supported, yet. Ported from http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/92f7b3144f41 http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/bfc040135633 Signed-off-by: Pat Campbell <plc@novell.com> Signed-off-by: Markus Armbruster <armbru@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/input/xen-kbdfront.c | 10 +++ drivers/video/xen-fbfront.c | 183 +++++++++++++++++++++++++++++++++------- include/xen/interface/io/fbif.h | 29 +++++-- 3 files changed, 188 insertions(+), 34 deletions(-) diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c index eaf69cf..9ce3b3b 100644 --- a/drivers/input/xen-kbdfront.c +++ b/drivers/input/xen-kbdfront.c @@ -300,6 +300,16 @@ InitWait: */ if (dev->state != XenbusStateConnected) goto InitWait; /* no InitWait seen yet, fudge it */ + + /* Set input abs params to match backend screen res */ + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "width", "%d", &val) > 0) + input_set_abs_params(info->ptr, ABS_X, 0, val, 0, 0); + + if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "height", "%d", &val) > 0) + input_set_abs_params(info->ptr, ABS_Y, 0, val, 0, 0); + break; case XenbusStateClosing: diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 291eef6..47ed39b 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -43,23 +43,47 @@ struct xenfb_info { struct xenfb_page *page; unsigned long *mfns; int update_wanted; /* XENFB_TYPE_UPDATE wanted */ + int feature_resize; /* XENFB_TYPE_RESIZE ok */ + struct xenfb_resize resize; /* protected by resize_lock */ + int resize_dpy; /* ditto */ + spinlock_t resize_lock; struct xenbus_device *xbdev; }; -static u32 xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8; +#define XENFB_DEFAULT_FB_LEN (XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8) + +enum { KPARAM_MEM, KPARAM_WIDTH, KPARAM_HEIGHT, KPARAM_CNT }; +static int video[KPARAM_CNT] = { 2, XENFB_WIDTH, XENFB_HEIGHT }; +module_param_array(video, int, NULL, 0); +MODULE_PARM_DESC(video, + "Video memory size in MB, width, height in pixels (default 2,800,600)"); static void xenfb_make_preferred_console(void); static int xenfb_remove(struct xenbus_device *); -static void xenfb_init_shared_page(struct xenfb_info *); +static void xenfb_init_shared_page(struct xenfb_info *, struct fb_info *); static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *); static void xenfb_disconnect_backend(struct xenfb_info *); +static void xenfb_send_event(struct xenfb_info *info, + union xenfb_out_event *event) +{ + u32 prod; + + prod = info->page->out_prod; + /* caller ensures !xenfb_queue_full() */ + mb(); /* ensure ring space available */ + XENFB_OUT_RING_REF(info->page, prod) = *event; + wmb(); /* ensure ring contents visible */ + info->page->out_prod = prod + 1; + + notify_remote_via_irq(info->irq); +} + static void xenfb_do_update(struct xenfb_info *info, int x, int y, int w, int h) { union xenfb_out_event event; - u32 prod; memset(&event, 0, sizeof(event)); event.type = XENFB_TYPE_UPDATE; @@ -68,14 +92,19 @@ static void xenfb_do_update(struct xenfb_info *info, event.update.width = w; event.update.height = h; - prod = info->page->out_prod; /* caller ensures !xenfb_queue_full() */ - mb(); /* ensure ring space available */ - XENFB_OUT_RING_REF(info->page, prod) = event; - wmb(); /* ensure ring contents visible */ - info->page->out_prod = prod + 1; + xenfb_send_event(info, &event); +} - notify_remote_via_irq(info->irq); +static void xenfb_do_resize(struct xenfb_info *info) +{ + union xenfb_out_event event; + + memset(&event, 0, sizeof(event)); + event.resize = info->resize; + + /* caller ensures !xenfb_queue_full() */ + xenfb_send_event(info, &event); } static int xenfb_queue_full(struct xenfb_info *info) @@ -87,12 +116,28 @@ static int xenfb_queue_full(struct xenfb_info *info) return prod - cons == XENFB_OUT_RING_LEN; } +static void xenfb_handle_resize_dpy(struct xenfb_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&info->resize_lock, flags); + if (info->resize_dpy) { + if (!xenfb_queue_full(info)) { + info->resize_dpy = 0; + xenfb_do_resize(info); + } + } + spin_unlock_irqrestore(&info->resize_lock, flags); +} + static void xenfb_refresh(struct xenfb_info *info, int x1, int y1, int w, int h) { unsigned long flags; - int y2 = y1 + h - 1; int x2 = x1 + w - 1; + int y2 = y1 + h - 1; + + xenfb_handle_resize_dpy(info); if (!info->update_wanted) return; @@ -225,6 +270,57 @@ static ssize_t xenfb_write(struct fb_info *p, const char __user *buf, return res; } +static int +xenfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) +{ + struct xenfb_info *xenfb_info; + int required_mem_len; + + xenfb_info = info->par; + + if (!xenfb_info->feature_resize) { + if (var->xres == video[KPARAM_WIDTH] && + var->yres == video[KPARAM_HEIGHT] && + var->bits_per_pixel == xenfb_info->page->depth) { + return 0; + } + return -EINVAL; + } + + /* Can't resize past initial width and height */ + if (var->xres > video[KPARAM_WIDTH] || var->yres > video[KPARAM_HEIGHT]) + return -EINVAL; + + required_mem_len = var->xres * var->yres * xenfb_info->page->depth / 8; + if (var->bits_per_pixel == xenfb_info->page->depth && + var->xres <= info->fix.line_length / (XENFB_DEPTH / 8) && + required_mem_len <= info->fix.smem_len) { + var->xres_virtual = var->xres; + var->yres_virtual = var->yres; + return 0; + } + return -EINVAL; +} + +static int xenfb_set_par(struct fb_info *info) +{ + struct xenfb_info *xenfb_info; + unsigned long flags; + + xenfb_info = info->par; + + spin_lock_irqsave(&xenfb_info->resize_lock, flags); + xenfb_info->resize.type = XENFB_TYPE_RESIZE; + xenfb_info->resize.width = info->var.xres; + xenfb_info->resize.height = info->var.yres; + xenfb_info->resize.stride = info->fix.line_length; + xenfb_info->resize.depth = info->var.bits_per_pixel; + xenfb_info->resize.offset = 0; + xenfb_info->resize_dpy = 1; + spin_unlock_irqrestore(&xenfb_info->resize_lock, flags); + return 0; +} + static struct fb_ops xenfb_fb_ops = { .owner = THIS_MODULE, .fb_read = fb_sys_read, @@ -233,6 +329,8 @@ static struct fb_ops xenfb_fb_ops = { .fb_fillrect = xenfb_fillrect, .fb_copyarea = xenfb_copyarea, .fb_imageblit = xenfb_imageblit, + .fb_check_var = xenfb_check_var, + .fb_set_par = xenfb_set_par, }; static irqreturn_t xenfb_event_handler(int rq, void *dev_id) @@ -261,6 +359,8 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, { struct xenfb_info *info; struct fb_info *fb_info; + int fb_size; + int val; int ret; info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -268,18 +368,35 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); return -ENOMEM; } + + /* Limit kernel param videoram amount to what is in xenstore */ + if (xenbus_scanf(XBT_NIL, dev->otherend, "videoram", "%d", &val) == 1) { + if (val < video[KPARAM_MEM]) + video[KPARAM_MEM] = val; + } + + /* If requested res does not fit in available memory, use default */ + fb_size = video[KPARAM_MEM] * 1024 * 1024; + if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH / 8 + > fb_size) { + video[KPARAM_WIDTH] = XENFB_WIDTH; + video[KPARAM_HEIGHT] = XENFB_HEIGHT; + fb_size = XENFB_DEFAULT_FB_LEN; + } + dev->dev.driver_data = info; info->xbdev = dev; info->irq = -1; info->x1 = info->y1 = INT_MAX; spin_lock_init(&info->dirty_lock); + spin_lock_init(&info->resize_lock); - info->fb = vmalloc(xenfb_mem_len); + info->fb = vmalloc(fb_size); if (info->fb == NULL) goto error_nomem; - memset(info->fb, 0, xenfb_mem_len); + memset(info->fb, 0, fb_size); - info->nr_pages = (xenfb_mem_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT; info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages); if (!info->mfns) @@ -290,8 +407,6 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, if (!info->page) goto error_nomem; - xenfb_init_shared_page(info); - /* abusing framebuffer_alloc() to allocate pseudo_palette */ fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL); if (fb_info == NULL) @@ -304,9 +419,9 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, fb_info->screen_base = info->fb; fb_info->fbops = &xenfb_fb_ops; - fb_info->var.xres_virtual = fb_info->var.xres = info->page->width; - fb_info->var.yres_virtual = fb_info->var.yres = info->page->height; - fb_info->var.bits_per_pixel = info->page->depth; + fb_info->var.xres_virtual = fb_info->var.xres = video[KPARAM_WIDTH]; + fb_info->var.yres_virtual = fb_info->var.yres = video[KPARAM_HEIGHT]; + fb_info->var.bits_per_pixel = XENFB_DEPTH; fb_info->var.red = (struct fb_bitfield){16, 8, 0}; fb_info->var.green = (struct fb_bitfield){8, 8, 0}; @@ -318,9 +433,9 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, fb_info->var.vmode = FB_VMODE_NONINTERLACED; fb_info->fix.visual = FB_VISUAL_TRUECOLOR; - fb_info->fix.line_length = info->page->line_length; + fb_info->fix.line_length = fb_info->var.xres * XENFB_DEPTH / 8; fb_info->fix.smem_start = 0; - fb_info->fix.smem_len = xenfb_mem_len; + fb_info->fix.smem_len = fb_size; strcpy(fb_info->fix.id, "xen"); fb_info->fix.type = FB_TYPE_PACKED_PIXELS; fb_info->fix.accel = FB_ACCEL_NONE; @@ -337,6 +452,8 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, fb_info->fbdefio = &xenfb_defio; fb_deferred_io_init(fb_info); + xenfb_init_shared_page(info, fb_info); + ret = register_framebuffer(fb_info); if (ret) { fb_deferred_io_cleanup(fb_info); @@ -389,7 +506,7 @@ static int xenfb_resume(struct xenbus_device *dev) struct xenfb_info *info = dev->dev.driver_data; xenfb_disconnect_backend(info); - xenfb_init_shared_page(info); + xenfb_init_shared_page(info, info->fb_info); return xenfb_connect_backend(dev, info); } @@ -417,20 +534,23 @@ static unsigned long vmalloc_to_mfn(void *address) return pfn_to_mfn(vmalloc_to_pfn(address)); } -static void xenfb_init_shared_page(struct xenfb_info *info) +static void xenfb_init_shared_page(struct xenfb_info *info, + struct fb_info *fb_info) { int i; + int epd = PAGE_SIZE / sizeof(info->mfns[0]); for (i = 0; i < info->nr_pages; i++) info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE); - info->page->pd[0] = vmalloc_to_mfn(info->mfns); - info->page->pd[1] = 0; - info->page->width = XENFB_WIDTH; - info->page->height = XENFB_HEIGHT; - info->page->depth = XENFB_DEPTH; - info->page->line_length = (info->page->depth / 8) * info->page->width; - info->page->mem_length = xenfb_mem_len; + for (i = 0; i * epd < info->nr_pages; i++) + info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]); + + info->page->width = fb_info->var.xres; + info->page->height = fb_info->var.yres; + info->page->depth = fb_info->var.bits_per_pixel; + info->page->line_length = fb_info->fix.line_length; + info->page->mem_length = fb_info->fix.smem_len; info->page->in_cons = info->page->in_prod = 0; info->page->out_cons = info->page->out_prod = 0; } @@ -530,6 +650,11 @@ InitWait: val = 0; if (val) info->update_wanted = 1; + + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-resize", "%d", &val) < 0) + val = 0; + info->feature_resize = val; break; case XenbusStateClosing: diff --git a/include/xen/interface/io/fbif.h b/include/xen/interface/io/fbif.h index 5a934dd..974a51e 100644 --- a/include/xen/interface/io/fbif.h +++ b/include/xen/interface/io/fbif.h @@ -49,11 +49,27 @@ struct xenfb_update { int32_t height; /* rect height */ }; +/* + * Framebuffer resize notification event + * Capable backend sets feature-resize in xenstore. + */ +#define XENFB_TYPE_RESIZE 3 + +struct xenfb_resize { + uint8_t type; /* XENFB_TYPE_RESIZE */ + int32_t width; /* width in pixels */ + int32_t height; /* height in pixels */ + int32_t stride; /* stride in bytes */ + int32_t depth; /* depth in bits */ + int32_t offset; /* start offset within framebuffer */ +}; + #define XENFB_OUT_EVENT_SIZE 40 union xenfb_out_event { uint8_t type; struct xenfb_update update; + struct xenfb_resize resize; char pad[XENFB_OUT_EVENT_SIZE]; }; @@ -105,15 +121,18 @@ struct xenfb_page { * Each directory page holds PAGE_SIZE / sizeof(*pd) * framebuffer pages, and can thus map up to PAGE_SIZE * * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and - * sizeof(unsigned long) == 4, that's 4 Megs. Two directory - * pages should be enough for a while. + * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 + * Megs 64 bit. 256 directories give enough room for a 512 + * Meg framebuffer with a max resolution of 12,800x10,240. + * Should be enough for a while with room leftover for + * expansion. */ - unsigned long pd[2]; + unsigned long pd[256]; }; /* - * Wart: xenkbd needs to know resolution. Put it here until a better - * solution is found, but don't leak it to the backend. + * Wart: xenkbd needs to know default resolution. Put it here until a + * better solution is found, but don't leak it to the backend. */ #ifdef __KERNEL__ #define XENFB_WIDTH 800 -- cgit v1.1 From 83abc70a4c6e306f4c1672e25884322f797e4fcb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:12 +0100 Subject: xen: make earlyprintk=xen work again For some perverse reason, if you call add_preferred_console() it prevents setup_early_printk() from successfully enabling the boot console - unless you make it a preferred console too... Also, make xenboot console output distinct from normal console output, since it gets repeated when the console handover happens, and the duplicated output is confusing without disambiguation. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Markus Armbruster <armbru@redhat.com> Cc: Gerd Hoffmann <kraxel@redhat.com> --- arch/x86/xen/enlighten.c | 1 + drivers/char/hvc_xen.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6cfb708..5c0635a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1257,6 +1257,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; if (!is_initial_xendomain()) { + add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); } diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index 2413af3..d5fe0a8 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -155,6 +155,7 @@ static void xenboot_write_console(struct console *console, const char *string, raw_console_write(string, len); + write_console(0, "(early) ", 8); while (off < len && NULL != (pos = strchr(string+off, '\n'))) { linelen = pos-string+off; if (off + linelen > len) -- cgit v1.1 From ec9b2065d4d3b797604c09a569083dd9ff951b1b Mon Sep 17 00:00:00 2001 From: Isaku Yamahata <yamahata@valinux.co.jp> Date: Mon, 26 May 2008 23:31:13 +0100 Subject: xen: Move manage.c to drivers/xen for ia64/xen support move arch/x86/xen/manage.c under drivers/xen/to share codes with x86 and ia64. ia64/xen also uses manage.c Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/manage.c | 143 -------------------------------------------------- drivers/xen/Makefile | 2 +- drivers/xen/manage.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 145 deletions(-) delete mode 100644 arch/x86/xen/manage.c create mode 100644 drivers/xen/manage.c diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3d8df98..40b119b 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o manage.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c deleted file mode 100644 index aa7af9e..0000000 --- a/arch/x86/xen/manage.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Handle extern requests for shutdown, reboot and sysrq - */ -#include <linux/kernel.h> -#include <linux/err.h> -#include <linux/reboot.h> -#include <linux/sysrq.h> - -#include <xen/xenbus.h> - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 - -/* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; - -static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - char *str; - struct xenbus_transaction xbt; - int err; - - if (shutting_down != SHUTDOWN_INVALID) - return; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - - str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); - /* Ignore read errors and empty reads. */ - if (XENBUS_IS_ERR_READ(str)) { - xenbus_transaction_end(xbt, 1); - return; - } - - xenbus_write(xbt, "control", "shutdown", ""); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) { - kfree(str); - goto again; - } - - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) - orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); - else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); - shutting_down = SHUTDOWN_INVALID; - } - - kfree(str); -} - -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char sysrq_key = '\0'; - struct xenbus_transaction xbt; - int err; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); - xenbus_transaction_end(xbt, 1); - return; - } - - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - - if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); -} - -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - -static struct xenbus_watch sysrq_watch = { - .node = "control/sysrq", - .callback = sysrq_handler -}; - -static int setup_shutdown_watcher(void) -{ - int err; - - err = register_xenbus_watch(&shutdown_watch); - if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); - return err; - } - - err = register_xenbus_watch(&sysrq_watch); - if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); - return err; - } - - return 0; -} - -static int shutdown_event(struct notifier_block *notifier, - unsigned long event, - void *data) -{ - setup_shutdown_watcher(); - return NOTIFY_DONE; -} - -static int __init setup_shutdown_event(void) -{ - static struct notifier_block xenstore_notifier = { - .notifier_call = shutdown_event - }; - register_xenstore_notifier(&xenstore_notifier); - - return 0; -} - -subsys_initcall(setup_shutdown_event); diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 37af04f..363286c 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o +obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c new file mode 100644 index 0000000..aa7af9e --- /dev/null +++ b/drivers/xen/manage.c @@ -0,0 +1,143 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> + +#include <xen/xenbus.h> + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + * report a crash, not be instructed to crash! + * HALT is the same as POWEROFF, as far as we're concerned. The tools use + * the distinction when we return the reason code to them. + */ +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. */ +static int shutting_down = SHUTDOWN_INVALID; + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0 || + strcmp(str, "halt") == 0) + orderly_poweroff(false); + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else { + printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); -- cgit v1.1 From a90971ebddc81330f59203dee9803512aa4e2ef6 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata <yamahata@valinux.co.jp> Date: Mon, 26 May 2008 23:31:14 +0100 Subject: xen: compilation fix to balloon driver for ia64 support fix compilation error of ballon driver on ia64. extent_start member is pointer argument. On x86 pointer argument for xen hypercall is passed as virtual address. On the other hand, ia64 and ppc, pointer argument is passed in pseudo physical address. (guest physicall address.) So they must be passed as handle and convert right before issuing hypercall. CC drivers/xen/balloon.o linux-2.6-x86/drivers/xen/balloon.c: In function 'increase_reservation': linux-2.6-x86/drivers/xen/balloon.c:228: error: incompatible types in assignment linux-2.6-x86/drivers/xen/balloon.c: In function 'decrease_reservation': linux-2.6-x86/drivers/xen/balloon.c:324: error: incompatible types in assignment linux-2.6-x86/drivers/xen/balloon.c: In function 'dealloc_pte_fn': linux-2.6-x86/drivers/xen/balloon.c:486: error: incompatible types in assignment linux-2.6-x86/drivers/xen/balloon.c: In function 'alloc_empty_pages_and_pagevec': linux-2.6-x86/drivers/xen/balloon.c:522: error: incompatible types in assignment make[2]: *** [drivers/xen/balloon.o] Error 1 Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/xen/balloon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ab25ba6..097ba02 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -225,7 +225,7 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_next_page(page); } - reservation.extent_start = (unsigned long)frame_list; + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op( XENMEM_populate_physmap, &reservation); @@ -321,7 +321,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_append(pfn_to_page(pfn)); } - reservation.extent_start = (unsigned long)frame_list; + set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != nr_pages); @@ -483,7 +483,7 @@ static int dealloc_pte_fn( .extent_order = 0, .domid = DOMID_SELF }; - reservation.extent_start = (unsigned long)&mfn; + set_xen_guest_handle(reservation.extent_start, &mfn); set_pte_at(&init_mm, addr, pte, __pte_ma(0ull)); set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); @@ -519,7 +519,7 @@ static struct page **alloc_empty_pages_and_pagevec(int nr_pages) .extent_order = 0, .domid = DOMID_SELF }; - reservation.extent_start = (unsigned long)&gmfn; + set_xen_guest_handle(reservation.extent_start, &gmfn); ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); if (ret == 1) -- cgit v1.1 From bfdab126cfa6fe3c2ddb8b6007a38202b510b6c1 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata <yamahata@valinux.co.jp> Date: Mon, 26 May 2008 23:31:15 +0100 Subject: xen: add missing definitions in include/xen/interface/memory.h which ia64/xen needs Add xen handles realted definitions for xen memory which ia64/xen needs. Pointer argumsnts for ia64/xen hypercall are passed in pseudo physical address (guest physical address) so that it is required to convert guest kernel virtual address into pseudo physical address. The xen guest handle represents such arguments. Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/xen/interface/memory.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index da76846..af36ead 100644 --- a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -29,7 +29,7 @@ struct xen_memory_reservation { * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) */ - ulong extent_start; + GUEST_HANDLE(ulong) extent_start; /* Number of extents, and size/alignment of each (2^extent_order pages). */ unsigned long nr_extents; @@ -50,6 +50,7 @@ struct xen_memory_reservation { domid_t domid; }; +DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); /* * Returns the maximum machine frame number of mapped RAM in this system. @@ -85,7 +86,7 @@ struct xen_machphys_mfn_list { * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ - ulong extent_start; + GUEST_HANDLE(ulong) extent_start; /* * Number of extents written to the above array. This will be smaller @@ -93,6 +94,7 @@ struct xen_machphys_mfn_list { */ unsigned int nr_extents; }; +DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); /* * Sets the GPFN at which a particular page appears in the specified guest's @@ -115,6 +117,7 @@ struct xen_add_to_physmap { /* GPFN where the source mapping page should appear. */ unsigned long gpfn; }; +DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap); /* * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error @@ -129,13 +132,14 @@ struct xen_translate_gpfn_list { unsigned long nr_gpfns; /* List of GPFNs to translate. */ - ulong gpfn_list; + GUEST_HANDLE(ulong) gpfn_list; /* * Output list to contain MFN translations. May be the same as the input * list (in which case each input GPFN is overwritten with the output MFN). */ - ulong mfn_list; + GUEST_HANDLE(ulong) mfn_list; }; +DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); #endif /* __XEN_PUBLIC_MEMORY_H__ */ -- cgit v1.1 From 38bb5ab4179572f4d24d3ca7188172a31ca51a69 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:16 +0100 Subject: xen: count resched interrupts properly Make sure resched interrupts appear in /proc/interrupts in the proper place. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/smp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e6900..74ab896 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -65,6 +65,12 @@ static struct call_data_struct *call_data; */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_resched_count++; +#else + add_pda(irq_resched_count, 1); +#endif + return IRQ_HANDLED; } -- cgit v1.1 From 955d6f1778da5a9795f2dfb07f760006f194609a Mon Sep 17 00:00:00 2001 From: Adrian Bunk <bunk@kernel.org> Date: Mon, 26 May 2008 23:31:17 +0100 Subject: xen: drivers/xen/balloon.c: make a function static Make the needlessly global balloon_set_new_target() static. Signed-off-by: Adrian Bunk <bunk@kernel.org> Acked-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/xen/balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 097ba02..591bc29 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -368,7 +368,7 @@ static void balloon_process(struct work_struct *work) } /* Resets the Xen limit, sets new target, and kicks off processing. */ -void balloon_set_new_target(unsigned long target) +static void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.hard_limit = ~0UL; -- cgit v1.1 From d451bb7aa852627bdf7be7937dc3d9d9f261b235 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:18 +0100 Subject: xen: make phys_to_machine structure dynamic We now support the use of memory hotplug, so the physical to machine page mapping structure must be dynamic. This is implemented as a two-level radix tree structure, which allows us to efficiently incrementally allocate memory for the p2m table as new pages are added. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/setup.c | 2 -- arch/x86/xen/xen-ops.h | 2 ++ include/asm-x86/xen/page.h | 20 ++++------- 5 files changed, 94 insertions(+), 17 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5c0635a..73d3c84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1221,7 +1221,7 @@ asmlinkage void __init xen_start_kernel(void) /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + xen_build_dynamic_phys_to_machine(); pgd = (pgd_t *)xen_start_info->pt_base; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 07c2653..c3b27de 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,6 +56,91 @@ #include "multicalls.h" #include "mmu.h" +/* + * This should probably be a config option. On 32-bit, it costs 1 + * page/gig of memory; on 64-bit its 2 pages/gig. If we want it to be + * completely unbounded we can add another level to the p2m structure. + */ +#define MAX_GUEST_PAGES (16ull * 1024*1024*1024 / PAGE_SIZE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) + +static unsigned long *p2m_top[MAX_GUEST_PAGES / P2M_ENTRIES_PER_PAGE]; + +static inline unsigned p2m_top_index(unsigned long pfn) +{ + BUG_ON(pfn >= MAX_GUEST_PAGES); + return pfn / P2M_ENTRIES_PER_PAGE; +} + +static inline unsigned p2m_index(unsigned long pfn) +{ + return pfn % P2M_ENTRIES_PER_PAGE; +} + +void __init xen_build_dynamic_phys_to_machine(void) +{ + unsigned pfn; + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + + BUG_ON(xen_start_info->nr_pages >= MAX_GUEST_PAGES); + + for(pfn = 0; + pfn < xen_start_info->nr_pages; + pfn += P2M_ENTRIES_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + + p2m_top[topidx] = &mfn_list[pfn]; + } +} + +unsigned long get_phys_to_machine(unsigned long pfn) +{ + unsigned topidx, idx; + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == NULL) + return INVALID_P2M_ENTRY; + + idx = p2m_index(pfn); + return p2m_top[topidx][idx]; +} + +static void alloc_p2m(unsigned long **pp) +{ + unsigned long *p; + unsigned i; + + p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(p == NULL); + + for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p[i] = INVALID_P2M_ENTRY; + + if (cmpxchg(pp, NULL, p) != NULL) + free_page((unsigned long)p); +} + +void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, idx; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == NULL) { + /* no need to allocate a page to store an invalid entry */ + if (mfn == INVALID_P2M_ENTRY) + return; + alloc_p2m(&p2m_top[topidx]); + } + + idx = p2m_index(pfn); + p2m_top[topidx][idx] = mfn; +} + xmaddr_t arbitrary_virt_to_machine(unsigned long address) { unsigned int level; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 82517e4..37f8f0b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,8 +27,6 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; -unsigned long *phys_to_machine_mapping; -EXPORT_SYMBOL(phys_to_machine_mapping); /** * machine_specific_memory_setup - Hook for machine specific memory setup. diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f1063ae..7bdc8c5 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -22,6 +22,8 @@ void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void __init xen_build_dynamic_phys_to_machine(void); + void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_cpu_khz(void); diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index e11f240..293344f 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -26,15 +26,15 @@ typedef struct xpaddr { #define FOREIGN_FRAME_BIT (1UL<<31) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) -extern unsigned long *phys_to_machine_mapping; +extern unsigned long get_phys_to_machine(unsigned long pfn); +extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - return phys_to_machine_mapping[(unsigned int)(pfn)] & - ~FOREIGN_FRAME_BIT; + return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; } static inline int phys_to_machine_mapping_valid(unsigned long pfn) @@ -42,7 +42,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) if (xen_feature(XENFEAT_auto_translated_physmap)) return 1; - return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); + return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; } static inline unsigned long mfn_to_pfn(unsigned long mfn) @@ -106,20 +106,12 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) unsigned long pfn = mfn_to_pfn(mfn); if ((pfn < max_mapnr) && !xen_feature(XENFEAT_auto_translated_physmap) - && (phys_to_machine_mapping[pfn] != mfn)) + && (get_phys_to_machine(pfn) != mfn)) return max_mapnr; /* force !pfn_valid() */ + /* XXX fixme; not true with sparsemem */ return pfn; } -static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; - } - phys_to_machine_mapping[pfn] = mfn; -} - /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) #define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) -- cgit v1.1 From 8006ec3e911f93d702e1d4a4e387e244ab434924 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:19 +0100 Subject: xen: add configurable max domain size Add a config option to set the max size of a Xen domain. This is used to scale the size of the physical-to-machine array; it ends up using around 1 page/GByte, so there's no reason to be very restrictive. For a 32-bit guest, the default value of 8GB is probably sufficient; there's not much point in giving a 32-bit machine much more memory than that. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/Kconfig | 10 ++++++++++ arch/x86/xen/mmu.c | 25 ++++++++++++------------- arch/x86/xen/setup.c | 3 +++ include/asm-x86/xen/page.h | 5 +++++ 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 525b108..d0f1c7c 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -11,3 +11,13 @@ config XEN This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the Xen hypervisor. + +config XEN_MAX_DOMAIN_MEMORY + int "Maximum allowed size of a domain in gigabytes" + default 8 + depends on XEN + help + The pseudo-physical to machine address array is sized + according to the maximum possible memory size of a Xen + domain. This array uses 1 page per gigabyte, so there's no + need to be too stingy here. \ No newline at end of file diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c3b27de..644232a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,19 +56,13 @@ #include "multicalls.h" #include "mmu.h" -/* - * This should probably be a config option. On 32-bit, it costs 1 - * page/gig of memory; on 64-bit its 2 pages/gig. If we want it to be - * completely unbounded we can add another level to the p2m structure. - */ -#define MAX_GUEST_PAGES (16ull * 1024*1024*1024 / PAGE_SIZE) #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -static unsigned long *p2m_top[MAX_GUEST_PAGES / P2M_ENTRIES_PER_PAGE]; +static unsigned long *p2m_top[MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE]; static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_GUEST_PAGES); + BUG_ON(pfn >= MAX_DOMAIN_PAGES); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -81,12 +75,9 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned pfn; unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - BUG_ON(xen_start_info->nr_pages >= MAX_GUEST_PAGES); - - for(pfn = 0; - pfn < xen_start_info->nr_pages; - pfn += P2M_ENTRIES_PER_PAGE) { + for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@ -97,6 +88,9 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + return INVALID_P2M_ENTRY; + topidx = p2m_top_index(pfn); if (p2m_top[topidx] == NULL) return INVALID_P2M_ENTRY; @@ -129,6 +123,11 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) return; } + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return; + } + topidx = p2m_top_index(pfn); if (p2m_top[topidx] == NULL) { /* no need to allocate a page to store an invalid entry */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 37f8f0b..4884478 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -16,6 +16,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/page.h> #include <xen/interface/callback.h> #include <xen/interface/physdev.h> #include <xen/features.h> @@ -36,6 +37,8 @@ char * __init xen_memory_setup(void) { unsigned long max_pfn = xen_start_info->nr_pages; + max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + e820.nr_map = 0; add_memory_region(0, LOWMEMSIZE(), E820_RAM); add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index 293344f..377c045 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -26,6 +26,11 @@ typedef struct xpaddr { #define FOREIGN_FRAME_BIT (1UL<<31) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +/* Maximum amount of memory we can handle in a domain in pages */ +#define MAX_DOMAIN_PAGES \ + ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) + + extern unsigned long get_phys_to_machine(unsigned long pfn); extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); -- cgit v1.1 From cf0923ea295ba08ae656ef04164a43cb6553ba99 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:20 +0100 Subject: xen: efficiently support a holey p2m table When using sparsemem and memory hotplug, the kernel's pseudo-physical address space can be discontigious. Previously this was dealt with by having the upper parts of the radix tree stubbed off. Unfortunately, this is incompatible with save/restore, which requires a complete p2m table. The solution is to have a special distinguished all-invalid p2m leaf page, which we can point all the hole areas at. This allows the tools to see a complete p2m table, but it only costs a page for all memory holes. It also simplifies the code since it removes a few special cases. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/mmu.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 644232a..da7b45b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -57,8 +57,17 @@ #include "mmu.h" #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) -static unsigned long *p2m_top[MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE]; +/* Placeholder for holes in the address space */ +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] + __attribute__((section(".data.page_aligned"))) = + { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; + + /* Array of pointers to pages containing p2m entries */ +static unsigned long *p2m_top[TOP_ENTRIES] + __attribute__((section(".data.page_aligned"))) = + { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -92,9 +101,6 @@ unsigned long get_phys_to_machine(unsigned long pfn) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == NULL) - return INVALID_P2M_ENTRY; - idx = p2m_index(pfn); return p2m_top[topidx][idx]; } @@ -110,7 +116,7 @@ static void alloc_p2m(unsigned long **pp) for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; - if (cmpxchg(pp, NULL, p) != NULL) + if (cmpxchg(pp, p2m_missing, p) != p2m_missing) free_page((unsigned long)p); } @@ -129,7 +135,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == NULL) { + if (p2m_top[topidx] == p2m_missing) { /* no need to allocate a page to store an invalid entry */ if (mfn == INVALID_P2M_ENTRY) return; -- cgit v1.1 From a0d695c821544947342a2d372ec4108bc813b979 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:21 +0100 Subject: xen: make dummy_shared_info non-static Rename dummy_shared_info to xen_dummy_shared_info and make it non-static, in anticipation of users outside of enlighten.c Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 6 +++--- arch/x86/xen/xen-ops.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73d3c84..d9faaa2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,13 +75,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); -static /* __initdata */ struct shared_info dummy_shared_info; +struct shared_info xen_dummy_shared_info; /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -104,7 +104,7 @@ static void __init xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; - BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info); + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; if (!have_vcpu_info_placement) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7bdc8c5..826d6e4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -15,6 +15,7 @@ DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; +extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; char * __init xen_memory_setup(void); -- cgit v1.1 From d5edbc1f75420935b1ec7e65df10c8f81cea82de Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:22 +0100 Subject: xen: add p2m mfn_list_list When saving a domain, the Xen tools need to remap all our mfns to portable pfns. In order to remap our p2m table, it needs to know where all its pages are, so maintain the references to the p2m table for it to use. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 39 ++++++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d9faaa2..ce67dc8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -880,6 +880,8 @@ static __init void setup_shared_info(void) /* In UP this is as good a place as any to set up shared info */ xen_setup_vcpu_info_placement(); #endif + + xen_setup_mfn_list_list(); } static __init void xen_pagetable_setup_done(pgd_t *base) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index da7b45b..4740cda 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -69,6 +69,13 @@ static unsigned long *p2m_top[TOP_ENTRIES] __attribute__((section(".data.page_aligned"))) = { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +/* Arrays of p2m arrays expressed in mfns used for save/restore */ +static unsigned long p2m_top_mfn[TOP_ENTRIES] + __attribute__((section(".bss.page_aligned"))); + +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __attribute__((section(".bss.page_aligned"))); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_DOMAIN_PAGES); @@ -80,11 +87,35 @@ static inline unsigned p2m_index(unsigned long pfn) return pfn % P2M_ENTRIES_PER_PAGE; } +/* Build the parallel p2m_top_mfn structures */ +void xen_setup_mfn_list_list(void) +{ + unsigned pfn, idx; + + for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + + p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + } + + for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; + p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + } + + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn_list); + HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; +} + +/* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned pfn; unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned pfn; for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); @@ -105,7 +136,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) return p2m_top[topidx][idx]; } -static void alloc_p2m(unsigned long **pp) +static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) { unsigned long *p; unsigned i; @@ -118,6 +149,8 @@ static void alloc_p2m(unsigned long **pp) if (cmpxchg(pp, p2m_missing, p) != p2m_missing) free_page((unsigned long)p); + else + *mfnp = virt_to_mfn(p); } void set_phys_to_machine(unsigned long pfn, unsigned long mfn) @@ -139,7 +172,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) /* no need to allocate a page to store an invalid entry */ if (mfn == INVALID_P2M_ENTRY) return; - alloc_p2m(&p2m_top[topidx]); + alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]); } idx = p2m_index(pfn); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 826d6e4..a1bc89a 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -18,6 +18,8 @@ extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; +void xen_setup_mfn_list_list(void); + char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.1 From eb1e305f4ef201e549ffd475b7dcbcd4ec36d7dc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:23 +0100 Subject: xen: add rebind_evtchn_irq Add rebind_evtchn_irq(), which will rebind an device driver's existing irq to a new event channel on restore. Since the new event channel will be masked and bound to vcpu0, we update the state accordingly and unmask the irq once everything is set up. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/xen/events.c | 27 +++++++++++++++++++++++++++ include/xen/events.h | 1 + 2 files changed, 28 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b..f64e979 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -557,6 +557,33 @@ out: put_cpu(); } +/* Rebind a new event channel to an existing irq. */ +void rebind_evtchn_irq(int evtchn, int irq) +{ + /* Make sure the irq is masked, since the new event channel + will also be masked. */ + disable_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + /* After resume the irq<->evtchn mappings are all cleared out */ + BUG_ON(evtchn_to_irq[evtchn] != -1); + /* Expect irq to have been bound before, + so the bindcount should be non-0 */ + BUG_ON(irq_bindcount[irq] == 0); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + + spin_unlock(&irq_mapping_update_lock); + + /* new event channels are always bound to cpu 0 */ + irq_set_affinity(irq, cpumask_of_cpu(0)); + + /* Unmask the event channel. */ + enable_irq(irq); +} + /* Rebind an evtchn so that it gets delivered to a specific cpu */ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { diff --git a/include/xen/events.h b/include/xen/events.h index acd8e06..a82ec0c 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -32,6 +32,7 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector); int resend_irq_on_evtchn(unsigned int irq); +void rebind_evtchn_irq(int evtchn, int irq); static inline void notify_remote_via_evtchn(int port) { -- cgit v1.1 From 0f2287ad7c61f10b2a22a06e2a66cdbbbfc44ad0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:24 +0100 Subject: xen: fix unbind_from_irq() Rearrange the tests in unbind_from_irq() so that we can still unbind an irq even if the underlying event channel is bad. This allows a device driver to shuffle its irqs on save/restore before the underlying event channels have been fixed up. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/xen/events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index f64e979..70375a6 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -355,7 +355,7 @@ static void unbind_from_irq(unsigned int irq) spin_lock(&irq_mapping_update_lock); - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { close.port = evtchn; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) BUG(); @@ -375,7 +375,7 @@ static void unbind_from_irq(unsigned int irq) evtchn_to_irq[evtchn] = -1; irq_info[irq] = IRQ_UNBOUND; - dynamic_irq_init(irq); + dynamic_irq_cleanup(irq); } spin_unlock(&irq_mapping_update_lock); -- cgit v1.1 From 6b9b732d0e396a3f1a95977162a8624aafce38a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:25 +0100 Subject: xen-console: add save/restore Add code to: 1. Deal with the console page being canonicalized. During save, the console's mfn in the start_info structure is canonicalized to a pfn. In order to deal with that, we always use a copy of the pfn and indirect off that all the time. However, we fall back to using the mfn if the pfn hasn't been initialized yet. 2. Restore the console event channel, and rebind it to the existing irq. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/char/hvc_xen.c | 25 +++++++++++++++++++++---- include/xen/hvc-console.h | 2 ++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c index d5fe0a8..db2ae42 100644 --- a/drivers/char/hvc_xen.c +++ b/drivers/char/hvc_xen.c @@ -39,9 +39,14 @@ static int xencons_irq; /* ------------------------------------------------------------------ */ +static unsigned long console_pfn = ~0ul; + static inline struct xencons_interface *xencons_interface(void) { - return mfn_to_virt(xen_start_info->console.domU.mfn); + if (console_pfn == ~0ul) + return mfn_to_virt(xen_start_info->console.domU.mfn); + else + return __va(console_pfn << PAGE_SHIFT); } static inline void notify_daemon(void) @@ -101,20 +106,32 @@ static int __init xen_init(void) { struct hvc_struct *hp; - if (!is_running_on_xen()) - return 0; + if (!is_running_on_xen() || + is_initial_xendomain() || + !xen_start_info->console.domU.evtchn) + return -ENODEV; xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); if (xencons_irq < 0) - xencons_irq = 0 /* NO_IRQ */; + xencons_irq = 0; /* NO_IRQ */ + hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256); if (IS_ERR(hp)) return PTR_ERR(hp); hvc = hp; + + console_pfn = mfn_to_pfn(xen_start_info->console.domU.mfn); + return 0; } +void xen_console_resume(void) +{ + if (xencons_irq) + rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq); +} + static void __exit xen_fini(void) { if (hvc) diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index efc3237..fd5483a 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -3,6 +3,8 @@ extern struct console xenboot_console; +void xen_console_resume(void); + void xen_raw_console_write(const char *str); void xen_raw_printk(const char *fmt, ...); -- cgit v1.1 From 7d88d32a4670af583c896e5ecd3929b78538ca62 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:26 +0100 Subject: xenbus: rebind irq on restore When restoring, rebind the existing xenbus irq to the new xenbus event channel. (It turns out in practice that this is always the same, and is never updated on restore. That's a bug, but Xeno-linux has been like this for a long time, so it can't really be fixed.) Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- drivers/xen/xenbus/xenbus_comms.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 6efbe3f..090c61e 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -203,7 +203,6 @@ int xb_read(void *data, unsigned len) int xb_init_comms(void) { struct xenstore_domain_interface *intf = xen_store_interface; - int err; if (intf->req_prod != intf->req_cons) printk(KERN_ERR "XENBUS request ring is not quiescent " @@ -216,18 +215,20 @@ int xb_init_comms(void) intf->rsp_cons = intf->rsp_prod; } - if (xenbus_irq) - unbind_from_irqhandler(xenbus_irq, &xb_waitq); + if (xenbus_irq) { + /* Already have an irq; assume we're resuming */ + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq); + } else { + int err; + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, + 0, "xenbus", &xb_waitq); + if (err <= 0) { + printk(KERN_ERR "XENBUS request irq failed %i\n", err); + return err; + } - err = bind_evtchn_to_irqhandler( - xen_store_evtchn, wake_waiting, - 0, "xenbus", &xb_waitq); - if (err <= 0) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); - return err; + xenbus_irq = err; } - xenbus_irq = err; - return 0; } -- cgit v1.1 From 0e91398f2a5d4eb6b07df8115917d0d1cf3e9b58 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:27 +0100 Subject: xen: implement save/restore This patch implements Xen save/restore and migration. Saving is triggered via xenbus, which is polled in drivers/xen/manage.c. When a suspend request comes in, the kernel prepares itself for saving by: 1 - Freeze all processes. This is primarily to prevent any partially-completed pagetable updates from confusing the suspend process. If CONFIG_PREEMPT isn't defined, then this isn't necessary. 2 - Suspend xenbus and other devices 3 - Stop_machine, to make sure all the other vcpus are quiescent. The Xen tools require the domain to run its save off vcpu0. 4 - Within the stop_machine state, it pins any unpinned pgds (under construction or destruction), performs canonicalizes various other pieces of state (mostly converting mfns to pfns), and finally 5 - Suspend the domain Restore reverses the steps used to save the domain, ending when all the frozen processes are thawed. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 6 +-- arch/x86/xen/mmu.c | 46 +++++++++++++++++ arch/x86/xen/smp.c | 2 +- arch/x86/xen/suspend.c | 42 +++++++++++++++ arch/x86/xen/time.c | 8 +++ arch/x86/xen/xen-ops.h | 4 ++ drivers/xen/events.c | 83 +++++++++++++++++++++++++++++ drivers/xen/grant-table.c | 4 +- drivers/xen/manage.c | 126 ++++++++++++++++++++++++++++++++++++++++----- include/linux/page-flags.h | 1 + include/xen/events.h | 3 ++ include/xen/grant_table.h | 3 ++ include/xen/xen-ops.h | 9 ++++ 14 files changed, 318 insertions(+), 21 deletions(-) create mode 100644 arch/x86/xen/suspend.c diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b119b..2ba2d16 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ce67dc8..b94f63a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void setup_shared_info(void) +void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; - setup_shared_info(); + xen_setup_shared_info(); /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ @@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) } /* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) +void xen_setup_vcpu_info_placement(void) { int cpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4740cda..e959559 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + */ +void xen_mm_pin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + xen_pgd_pin((pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ @@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. + */ +void xen_mm_unpin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + printk("unpinning pinned %p\n", page_address(page)); + xen_pgd_unpin((pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 74ab896..d2e3c20 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,7 +35,7 @@ #include "xen-ops.h" #include "mmu.h" -static cpumask_t xen_cpu_initialized_map; +cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq) = -1; static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c new file mode 100644 index 0000000..7620a16 --- /dev/null +++ b/arch/x86/xen/suspend.c @@ -0,0 +1,42 @@ +#include <linux/types.h> + +#include <xen/interface/xen.h> +#include <xen/grant_table.h> +#include <xen/events.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +#include "xen-ops.h" +#include "mmu.h" + +void xen_pre_suspend(void) +{ + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_post_suspend(int suspend_cancelled) +{ + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + xen_cpu_initialized_map = cpu_online_map; +#endif + } + + xen_setup_shared_info(); +} + diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..0bef256 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_time_suspend(void) +{ +} + +void xen_time_resume(void) +{ +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a1bc89a..a0503ac 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -9,6 +9,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +struct trap_info; void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); +void xen_setup_shared_info(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); +extern cpumask_t xen_cpu_initialized_map; + /* Declare an asm function, along with symbols needed to make it inlineable */ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 70375a6..73d78dc 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -674,6 +674,89 @@ static int retrigger_dynirq(unsigned int irq) return ret; } +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_VIRQ); + BUG_ON(irq_info[irq].index != virq); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_IPI); + BUG_ON(irq_info[irq].index != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + + } +} + +void xen_irq_resume(void) +{ + unsigned int cpu, irq, evtchn; + + init_evtchn_cpu_bindings(); + + /* New event-channel space is not 'live' yet. */ + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + mask_evtchn(evtchn); + + /* No IRQ <-> event-channel mappings. */ + for (irq = 0; irq < NR_IRQS; irq++) + irq_info[irq].evtchn = 0; /* zap event-channel binding */ + + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + evtchn_to_irq[evtchn] = -1; + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", .mask = disable_dynirq, diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 52b6b41..e9e1116 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -471,14 +471,14 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return 0; } -static int gnttab_resume(void) +int gnttab_resume(void) { if (max_nr_grant_frames() < nr_grant_frames) return -ENOSYS; return gnttab_map(0, nr_grant_frames - 1); } -static int gnttab_suspend(void) +int gnttab_suspend(void) { arch_gnttab_unmap_shared(shared, nr_grant_frames); return 0; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index aa7af9e..ba85fa2 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -5,21 +5,113 @@ #include <linux/err.h> #include <linux/reboot.h> #include <linux/sysrq.h> +#include <linux/stop_machine.h> +#include <linux/freezer.h> #include <xen/xenbus.h> - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 +#include <xen/grant_table.h> +#include <xen/events.h> +#include <xen/hvc-console.h> +#include <xen/xen-ops.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +enum shutdown_state { + SHUTDOWN_INVALID = -1, + SHUTDOWN_POWEROFF = 0, + SHUTDOWN_SUSPEND = 2, + /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + report a crash, not be instructed to crash! + HALT is the same as POWEROFF, as far as we're concerned. The tools use + the distinction when we return the reason code to them. */ + SHUTDOWN_HALT = 4, +}; /* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; +static enum shutdown_state shutting_down = SHUTDOWN_INVALID; + +static int xen_suspend(void *data) +{ + int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + load_cr3(swapper_pg_dir); + + xen_mm_pin_all(); + gnttab_suspend(); + xen_time_suspend(); + xen_pre_suspend(); + + /* + * This hypercall returns 1 if suspend was cancelled + * or the domain was merely checkpointed, and 0 if it + * is resuming in a new domain. + */ + *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + xen_post_suspend(*cancelled); + xen_time_resume(); + gnttab_resume(); + xen_mm_unpin_all(); + + if (!*cancelled) { + xen_irq_resume(); + xen_console_resume(); + } + + return 0; +} + +static void do_suspend(void) +{ + int err; + int cancelled = 1; + + shutting_down = SHUTDOWN_SUSPEND; + +#ifdef CONFIG_PREEMPT + /* If the kernel is preemptible, we need to freeze all the processes + to prevent them from being in the middle of a pagetable update + during suspend. */ + err = freeze_processes(); + if (err) { + printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + return; + } +#endif + + err = device_suspend(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen suspend: device_suspend %d\n", err); + goto out; + } + + printk("suspending xenbus...\n"); + /* XXX use normal device tree? */ + xenbus_suspend(); + + err = stop_machine_run(xen_suspend, &cancelled, 0); + if (err) { + printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + goto out; + } + + if (!cancelled) + xenbus_resume(); + else + xenbus_suspend_cancel(); + + device_resume(); + + +out: +#ifdef CONFIG_PREEMPT + thaw_processes(); +#endif + shutting_down = SHUTDOWN_INVALID; +} static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -52,11 +144,17 @@ static void shutdown_handler(struct xenbus_watch *watch, } if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) + strcmp(str, "halt") == 0) { + shutting_down = SHUTDOWN_POWEROFF; orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) + } else if (strcmp(str, "reboot") == 0) { + shutting_down = SHUTDOWN_POWEROFF; /* ? */ ctrl_alt_del(); - else { +#ifdef CONFIG_PM_SLEEP + } else if (strcmp(str, "suspend") == 0) { + do_suspend(); +#endif + } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 590cff3..02955a1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */ PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */ +PAGEFLAG(SavePinned, dirty); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) diff --git a/include/xen/events.h b/include/xen/events.h index a82ec0c..67c4436 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -41,4 +41,7 @@ static inline void notify_remote_via_evtchn(int port) } extern void notify_remote_via_irq(int irq); + +extern void xen_irq_resume(void); + #endif /* _XEN_EVENTS_H */ diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 4662048..a40f1cd 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -51,6 +51,9 @@ struct gnttab_free_callback { u16 count; }; +int gnttab_suspend(void); +int gnttab_resume(void); + int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 10ddfe0..5d7a6db 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,4 +5,13 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +void xen_pre_suspend(void); +void xen_post_suspend(int suspend_cancelled); + +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); + +void xen_time_suspend(void); +void xen_time_resume(void); + #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.1 From 359cdd3f866b6219a6729e313faf2221397f3278 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 26 May 2008 23:31:28 +0100 Subject: xen: maintain clock offset over save/restore Hook into the device model to make sure that timekeeping's resume handler is called. This deals with our clocksource's non-monotonicity over the save/restore. Explicitly call clock_has_changed() to make sure that all the timers get retriggered properly. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/xen/time.c | 8 -------- drivers/xen/manage.c | 15 ++++++++++++--- include/xen/xen-ops.h | 3 --- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0bef256..c39e1a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,14 +572,6 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } -void xen_time_suspend(void) -{ -} - -void xen_time_resume(void) -{ -} - __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ba85fa2..675c3dd 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -34,14 +34,21 @@ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; static int xen_suspend(void *data) { int *cancelled = data; + int err; BUG_ON(!irqs_disabled()); load_cr3(swapper_pg_dir); + err = device_power_down(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen_suspend: device_power_down failed: %d\n", + err); + return err; + } + xen_mm_pin_all(); gnttab_suspend(); - xen_time_suspend(); xen_pre_suspend(); /* @@ -52,10 +59,11 @@ static int xen_suspend(void *data) *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); xen_post_suspend(*cancelled); - xen_time_resume(); gnttab_resume(); xen_mm_unpin_all(); + device_power_up(); + if (!*cancelled) { xen_irq_resume(); xen_console_resume(); @@ -105,7 +113,8 @@ static void do_suspend(void) device_resume(); - + /* Make sure timer events get retriggered on all CPUs */ + clock_was_set(); out: #ifdef CONFIG_PREEMPT thaw_processes(); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 5d7a6db..a706d6a 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -11,7 +11,4 @@ void xen_post_suspend(int suspend_cancelled); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); -void xen_time_suspend(void); -void xen_time_resume(void); - #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.1 From 3945e2c9abf8e00c2edc4aa29215ddfad1cd8cf7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 25 May 2008 10:00:09 -0700 Subject: x86: extend e820 ealy_res support 32bit - fix #2 remove extra -1 in reseve_early calling panic if can not find space for new RAMDISK Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/setup_32.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index c04e8c9..08b47a2 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -479,7 +479,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= end_of_lowmem/2) { - free_early(ramdisk_image, ramdisk_image + ramdisk_size - 1); + free_early(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -501,9 +501,13 @@ static void __init reserve_initrd(void) end_of_lowmem, ramdisk_size, PAGE_SIZE); + if (ramdisk_here == -1ULL) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size - 1, + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -579,15 +583,15 @@ void __init setup_bootmem_allocator(void) PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); - reserve_early(bootmap, bootmap + bootmap_size - 1, "BOOTMAP"); + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); #ifdef CONFIG_BLK_DEV_INITRD reserve_initrd(); #endif bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " low ram: %08lx - %08lx\n", min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); - printk(KERN_INFO " bootmap [%08lx - %08lx]\n", - bootmap, bootmap + bootmap_size - 1); + printk(KERN_INFO " bootmap %08lx - %08lx\n", + bootmap, bootmap + bootmap_size); register_bootmem_low_pages(max_low_pfn); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); -- cgit v1.1 From dd564d0cf08686cf0cc332bf9d48cba5b26a8171 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@ucw.cz> Date: Tue, 27 May 2008 18:03:56 +0200 Subject: x86: aperture_64.c: cleanups Some small cleanups for aperture_64.c; they should not really change any code. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: Dave Jones <davej@codemonkey.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5373f78..6eea42e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -111,7 +111,7 @@ static u32 __init allocate_aperture(void) /* Find a PCI capability */ -static __u32 __init find_cap(int bus, int slot, int func, int cap) +static u32 __init find_cap(int bus, int slot, int func, int cap) { int bytes; u8 pos; @@ -137,7 +137,7 @@ static __u32 __init find_cap(int bus, int slot, int func, int cap) } /* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) +static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) { u32 apsize; u32 apsizereg; @@ -202,7 +202,7 @@ static __u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. */ -static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +static u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int bus, slot, func; -- cgit v1.1 From b20aeccd6ad42ccb6be1b3d1d32618ddd2b31bf0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 28 May 2008 14:24:38 +0200 Subject: xen: fix early bootup crash on native hardware -tip tree auto-testing found the following early bootup hang: --------------> get_memcfg_from_srat: assigning address to rsdp RSD PTR v0 [Nvidia] BUG: Int 14: CR2 ffd00040 EDI 8092fbfe ESI ffd00040 EBP 80b0aee8 ESP 80b0aed0 EBX 000f76f0 EDX 0000000e ECX 00000003 EAX ffd00040 err 00000000 EIP 802c055a CS 00000060 flg 00010006 Stack: ffd00040 80bc78d0 80b0af6c 80b1dbfe 8093d8ba 00000008 80b42810 80b4ddb4 80b42842 00000000 80b0af1c 801079c8 808e724e 00000000 80b42871 802c0531 00000100 00000000 0003fff0 80b0af40 80129999 00040100 00040100 00000000 Pid: 0, comm: swapper Not tainted 2.6.26-rc4-sched-devel.git #570 [<802c055a>] ? strncmp+0x11/0x25 [<80b1dbfe>] ? get_memcfg_from_srat+0xb4/0x568 [<801079c8>] ? mcount_call+0x5/0x9 [<802c0531>] ? strcmp+0xa/0x22 [<80129999>] ? printk+0x38/0x3a [<80129999>] ? printk+0x38/0x3a [<8011b122>] ? memory_present+0x66/0x6f [<80b216b4>] ? setup_memory+0x13/0x40c [<80b16b47>] ? propagate_e820_map+0x80/0x97 [<80b1622a>] ? setup_arch+0x248/0x477 [<80129999>] ? printk+0x38/0x3a [<80b11759>] ? start_kernel+0x6e/0x2eb [<80b110fc>] ? i386_start_kernel+0xeb/0xf2 ======================= <------ with this config: http://redhat.com/~mingo/misc/config-Wed_May_28_01_33_33_CEST_2008.bad The thing is, the crash makes little sense at first sight. We crash on a benign-looking printk. The code around it got changed in -tip but checking those topic branches individually did not reproduce the bug. Bisection led to this commit: | d5edbc1f75420935b1ec7e65df10c8f81cea82de is first bad commit | commit d5edbc1f75420935b1ec7e65df10c8f81cea82de | Author: Jeremy Fitzhardinge <jeremy@goop.org> | Date: Mon May 26 23:31:22 2008 +0100 | | xen: add p2m mfn_list_list Which is somewhat surprising, as on native hardware Xen client side should have little to no side-effects. After some head scratching, it turns out the following happened: randconfig enabled the following Xen options: CONFIG_XEN=y CONFIG_XEN_MAX_DOMAIN_MEMORY=8 # CONFIG_XEN_BLKDEV_FRONTEND is not set # CONFIG_XEN_NETDEV_FRONTEND is not set CONFIG_HVC_XEN=y # CONFIG_XEN_BALLOON is not set which activated this piece of code in arch/x86/xen/mmu.c: > @@ -69,6 +69,13 @@ > __attribute__((section(".data.page_aligned"))) = > { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; > > +/* Arrays of p2m arrays expressed in mfns used for save/restore */ > +static unsigned long p2m_top_mfn[TOP_ENTRIES] > + __attribute__((section(".bss.page_aligned"))); > + > +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] > + __attribute__((section(".bss.page_aligned"))); The problem is, you must only put variables into .bss.page_aligned that have a _size_ that is _exactly_ page aligned. In this case the size of p2m_top_mfn_list is not page aligned: 80b8d000 b p2m_top_mfn 80b8f000 b p2m_top_mfn_list 80b8f008 b softirq_stack 80b97008 b hardirq_stack 80b9f008 b bm_pte So all subsequent variables get unaligned which, depending on luck, breaks the kernel in various funny ways. In this case what killed the kernel first was the misaligned bootmap pte page, resulting in that creative crash above. Anyway, this was a fun bug to track down :-) I think the moral is that .bss.page_aligned is a dangerous construct in its current form, and the symptoms of breakage are very non-trivial, so i think we need build-time checks to make sure all symbols in .bss.page_aligned are truly page aligned. The Xen fix below gets the kernel booting again. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e959559..eef3b5c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -73,7 +73,8 @@ static unsigned long *p2m_top[TOP_ENTRIES] static unsigned long p2m_top_mfn[TOP_ENTRIES] __attribute__((section(".bss.page_aligned"))); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] +static unsigned long p2m_top_mfn_list[ + PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] __attribute__((section(".bss.page_aligned"))); static inline unsigned p2m_top_index(unsigned long pfn) -- cgit v1.1 From c7d624d1ee7b77622305bd638755394e4d3f2d2f Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Wed, 28 May 2008 12:57:13 -0400 Subject: x86: Fix up silly i1586 boot message. Trying to boot a 64-bit kernel on a 32bit Pentium 4 gets you an amusing message along the lines of. "you need an x86-64, but you only have an i1586" due to the P4 being family F. Munge it to be 686. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/boot/cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 00e19ed..92d6fd7 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -28,6 +28,8 @@ static char *cpu_name(int level) if (level == 64) { return "x86-64"; } else { + if (level == 15) + level = 6; sprintf(buf, "i%d86", level); return buf; } -- cgit v1.1 From 19ec673ced067316b9732bc6d1c4ff4052e5f795 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Wed, 28 May 2008 23:00:47 +0400 Subject: x86: nmi - fix incorrect NMI watchdog used by default The commit commit 4b82b277707a39b97271439c475f186f63ec4692 Author: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat May 24 19:36:35 2008 +0400 set nmi_watchdog to NMI_IO_APIC as by default. This causes hangs on some machines with buggy watchdogs. Fix it - i.e. restore old behaviour. Thanks to Sitsofe Wheeler and Adrian Bunk for catching the problem and Maciej W. Rozycki for explanation what is going on there. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> CC: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 16 +++++++++------- include/asm-x86/nmi.h | 4 +++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 69a839f..3671a9f 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -84,20 +84,15 @@ static inline unsigned int get_timer_irqs(int cpu) #endif } +#ifdef CONFIG_X86_64 /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; -#ifdef CONFIG_X86_64 nmi_watchdog = NMI_NONE; -#else - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -#endif } +#endif #ifdef CONFIG_SMP /* @@ -488,8 +483,15 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return -EIO; } +#ifdef CONFIG_X86_64 /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); +#else + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; +#endif if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1e8f34d..6f4d44f 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,9 +38,11 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); +extern void nmi_watchdog_default(void); +#else +#define nmi_watchdog_default(void) do {} while (0) #endif -extern void nmi_watchdog_default(void); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; -- cgit v1.1 From 0261ac5f2f43a1906cfacfb19d62ed643d162cbe Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 29 May 2008 09:31:50 +0200 Subject: xen: fix "xen: implement save/restore" -tip testing found the following build breakage: drivers/built-in.o: In function `xen_suspend': manage.c:(.text+0x4390f): undefined reference to `xen_console_resume' with this config: http://redhat.com/~mingo/misc/config-Thu_May_29_09_23_16_CEST_2008.bad i have bisected it down to: | commit 0e91398f2a5d4eb6b07df8115917d0d1cf3e9b58 | Author: Jeremy Fitzhardinge <jeremy@goop.org> | Date: Mon May 26 23:31:27 2008 +0100 | | xen: implement save/restore the problem is that drivers/xen/manage.c is built unconditionally if CONFIG_XEN is enabled and makes use of xen_suspend(), but drivers/char/hvc_xen.c, where the xen_suspend() method is implemented, is only build if CONFIG_HVC_XEN=y as well. i have solved this by providing a NOP implementation for xen_suspend() in the !CONFIG_HVC_XEN case. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/xen/hvc-console.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index fd5483a..98b79bc 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -3,7 +3,11 @@ extern struct console xenboot_console; +#ifdef CONFIG_HVC_XEN void xen_console_resume(void); +#else +static inline void xen_console_resume(void) { } +#endif void xen_raw_console_write(const char *str); void xen_raw_printk(const char *fmt, ...); -- cgit v1.1 From ba3a5974239293d921235e6fa82b09b670e674ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 30 May 2008 14:55:47 +0200 Subject: - fix typo in include/asm-x86/nmi.h --- include/asm-x86/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 6f4d44f..972a4f6 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -40,7 +40,7 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) extern void default_do_nmi(struct pt_regs *); extern void nmi_watchdog_default(void); #else -#define nmi_watchdog_default(void) do {} while (0) +#define nmi_watchdog_default() do { } while (0) #endif extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); -- cgit v1.1 From d364319b989967b74e57fef5c8017fd56a16c392 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Fri, 2 May 2008 23:42:01 +0200 Subject: x86: move mmconfig declarations to header arch/x86/kernel/mmconf-fam10h_64.c is missing the prototypes, which are decalred in arch/x86/kernel/setup_64.c. Move the prototypes and the inline stubs to the appropriate header file. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/setup_64.c | 13 +------------ include/asm-x86/mmconfig.h | 12 ++++++++++++ 3 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 include/asm-x86/mmconfig.h diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index edc5fbf..fdfdc55 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -12,6 +12,7 @@ #include <asm/io.h> #include <asm/msr.h> #include <asm/acpi.h> +#include <asm/mmconfig.h> #include "../pci/pci.h" diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff128..341230d 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -71,6 +71,7 @@ #include <asm/topology.h> #include <asm/trampoline.h> #include <asm/pat.h> +#include <asm/mmconfig.h> #include <mach_apic.h> #ifdef CONFIG_PARAVIRT @@ -293,18 +294,6 @@ static void __init parse_setup_data(void) } } -#ifdef CONFIG_PCI_MMCONFIG -extern void __cpuinit fam10h_check_enable_mmcfg(void); -extern void __init check_enable_amd_mmconf_dmi(void); -#else -void __cpuinit fam10h_check_enable_mmcfg(void) -{ -} -void __init check_enable_amd_mmconf_dmi(void) -{ -} -#endif - /* * setup_arch - architecture-specific boot-time initializations * diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h new file mode 100644 index 0000000..95beda0 --- /dev/null +++ b/include/asm-x86/mmconfig.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MMCONFIG_H +#define _ASM_MMCONFIG_H + +#ifdef CONFIG_PCI_MMCONFIG +extern void __cpuinit fam10h_check_enable_mmcfg(void); +extern void __init check_enable_amd_mmconf_dmi(void); +#else +static inline void fam10h_check_enable_mmcfg(void) { } +static inline void check_enable_amd_mmconf_dmi(void) { } +#endif + +#endif -- cgit v1.1 From 4d285878564bb46cf64e54be18eeffe33ca583a0 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 22 May 2008 18:48:32 -0400 Subject: x86: Move the AMD64 specific parts out of setup_64.c Create a separate amd_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/amd_64.c | 235 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 232 ++---------------------------------------- 3 files changed, 242 insertions(+), 226 deletions(-) create mode 100644 arch/x86/kernel/cpu/amd_64.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a0c6f81..ef065c1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,6 +7,7 @@ obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o obj-$(CONFIG_X86_32) += amd.o +obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o obj-$(CONFIG_X86_32) += transmeta.o diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c new file mode 100644 index 0000000..1746f6f --- /dev/null +++ b/arch/x86/kernel/cpu/amd_64.c @@ -0,0 +1,235 @@ +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/numa_64.h> +#include <asm/mmconfig.h> +#include <asm/cacheflush.h> + +#include <mach_apic.h> + +extern int __cpuinit get_model_name(struct cpuinfo_x86 *c); +extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c); + +int force_mwait __cpuinitdata; + +#ifdef CONFIG_NUMA +static int __cpuinit nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits; +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node = 0; + unsigned apicid = hard_smp_processor_id(); +#endif + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; + +#endif +} + +#define ENABLE_C1E_MASK 0x18000000 +#define CPUID_PROCESSOR_SIGNATURE 1 +#define CPUID_XFAM 0x0ff00000 +#define CPUID_XFAM_K8 0x00000000 +#define CPUID_XFAM_10H 0x00100000 +#define CPUID_XFAM_11H 0x00200000 +#define CPUID_XMOD 0x000f0000 +#define CPUID_XMOD_REV_F 0x00040000 + +/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ +static __cpuinit int amd_apic_timer_broken(void) +{ + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + + switch (eax & CPUID_XFAM) { + case CPUID_XFAM_K8: + if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) + break; + case CPUID_XFAM_10H: + case CPUID_XFAM_11H: + rdmsr(MSR_K8_ENABLE_C1E, lo, hi); + if (lo & ENABLE_C1E_MASK) + return 1; + break; + default: + /* err on the side of caution */ + return 1; + } + return 0; +} + +void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +void __cpuinit init_amd(struct cpuinfo_x86 *c) +{ + unsigned level; + +#ifdef CONFIG_SMP + unsigned long value; + + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 15) { + rdmsrl(MSR_K8_HWCR, value); + value |= 1 << 6; + wrmsrl(MSR_K8_HWCR, value); + } +#endif + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_cpu_cap(c, 0*32+31); + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + if (c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + + level = get_model_name(c); + if (!level) { + switch (c->x86) { + case 15: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } + display_cacheinfo(c); + + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) + amd_detect_cmp(c); + + if (c->extended_cpuid_level >= 0x80000006 && + (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + + if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_K8); + + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + + if (c->x86 == 0x10) + fam10h_check_enable_mmcfg(); + + if (amd_apic_timer_broken()) + disable_apic_timer = 1; + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && + (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + set_memory_4k((unsigned long)__va(tseg), 1); + } +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 341230d..b07b199 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -96,8 +96,6 @@ int bootloader_type; unsigned long saved_video_mode; -int force_mwait __cpuinitdata; - /* * Early DMI memory */ @@ -526,7 +524,7 @@ void __init setup_arch(char **cmdline_p) check_enable_amd_mmconf_dmi(); } -static int __cpuinit get_model_name(struct cpuinfo_x86 *c) +int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -542,7 +540,7 @@ static int __cpuinit get_model_name(struct cpuinfo_x86 *c) } -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -574,228 +572,6 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) } } -#ifdef CONFIG_NUMA -static int __cpuinit nearby_node(int apicid) -{ - int i, node; - - for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif - bits = c->x86_coreid_bits; - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - - int ht_nodeid = c->initial_apicid; - - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -#endif -} - -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; - -#endif -} - -#define ENABLE_C1E_MASK 0x18000000 -#define CPUID_PROCESSOR_SIGNATURE 1 -#define CPUID_XFAM 0x0ff00000 -#define CPUID_XFAM_K8 0x00000000 -#define CPUID_XFAM_10H 0x00100000 -#define CPUID_XFAM_11H 0x00200000 -#define CPUID_XMOD 0x000f0000 -#define CPUID_XMOD_REV_F 0x00040000 - -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(void) -{ - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - - switch (eax & CPUID_XFAM) { - case CPUID_XFAM_K8: - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) - break; - case CPUID_XFAM_10H: - case CPUID_XFAM_11H: - rdmsr(MSR_K8_ENABLE_C1E, lo, hi); - if (lo & ENABLE_C1E_MASK) - return 1; - break; - default: - /* err on the side of caution */ - return 1; - } - return 0; -} - -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) -{ - early_init_amd_mc(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ - unsigned level; - -#ifdef CONFIG_SMP - unsigned long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 15) { - rdmsrl(MSR_K8_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); - } -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_cpu_cap(c, 0*32+31); - - /* On C+ stepping K8 rep microcode works well for copy/memset */ - level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || - level >= 0x0f58)) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - if (c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - level = get_model_name(c); - if (!level) { - switch (c->x86) { - case 15: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) - amd_detect_cmp(c); - - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_K8); - - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - - if (c->x86 == 0x10) - fam10h_check_enable_mmcfg(); - - if (amd_apic_timer_broken()) - disable_apic_timer = 1; - - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) - set_memory_4k((unsigned long)__va(tseg), 1); - } -} - void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -977,6 +753,10 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } +// FIXME: Needs to use cpu_vendor_dev_register +extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); +extern void __cpuinit init_amd(struct cpuinfo_x86 *c); + /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -- cgit v1.1 From a82fbe31cb387bb246e2d3b3c177f551bb991135 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 22 May 2008 18:54:32 -0400 Subject: x86: Move the 64-bit Intel specific parts out of setup_64.c Create a separate intel_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/intel_64.c | 97 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 93 +--------------------------------------- 3 files changed, 100 insertions(+), 91 deletions(-) create mode 100644 arch/x86/kernel/cpu/intel_64.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ef065c1..b7a1192 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o obj-$(CONFIG_X86_32) += transmeta.o obj-$(CONFIG_X86_32) += intel.o +obj-$(CONFIG_X86_64) += intel_64.o obj-$(CONFIG_X86_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c new file mode 100644 index 0000000..e5f929f --- /dev/null +++ b/arch/x86/kernel/cpu/intel_64.c @@ -0,0 +1,97 @@ +#include <linux/init.h> +#include <linux/smp.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/topology.h> +#include <asm/numa_64.h> + +void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +/* + * find out the number of processor cores on the die + */ +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, t; + + if (c->cpuid_level < 4) + return 1; + + cpuid_count(4, 0, &eax, &t, &t, &t); + + if (eax & 0x1f) + return ((eax >> 26) + 1); + else + return 1; +} + +static void __cpuinit srat_detect_node(void) +{ +#ifdef CONFIG_NUMA + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE || !node_online(node)) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + +void __cpuinit init_intel(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (cpu_has_ds) { + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + } + + + if (cpu_has_bts) + ds_init_intel(c); + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; + } + + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); + + srat_detect_node(); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index b07b199..c4e6a0b 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -622,97 +622,6 @@ out: #endif } -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - - if (c->cpuid_level < 4) - return 1; - - cpuid_count(4, 0, &eax, &t, &t, &t); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit srat_detect_node(void) -{ -#ifdef CONFIG_NUMA - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) -{ - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - } - - if (cpu_has_ds) { - unsigned int l1, l2; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - } - - - if (cpu_has_bts) - ds_init_intel(c); - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - /* CPUID workaround for Intel 0F34 CPU */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 0xF && c->x86_model == 0x3 && - c->x86_mask == 0x4) - c->x86_phys_bits = 36; - } - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - c->x86_max_cores = intel_num_cpu_cores(c); - - srat_detect_node(); -} - static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 0x6 && c->x86_model >= 0xf) @@ -756,6 +665,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) // FIXME: Needs to use cpu_vendor_dev_register extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); extern void __cpuinit init_amd(struct cpuinfo_x86 *c); +extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c); +extern void __cpuinit init_intel(struct cpuinfo_x86 *c); /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu -- cgit v1.1 From 7e2191127eb414d7d5a11df6552ab6e3845d17a1 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 22 May 2008 18:55:06 -0400 Subject: x86: Remove workaround for prescott (32bit P4) from 64-bit code. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/intel_64.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index e5f929f..b339121 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -79,11 +79,6 @@ void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; - /* CPUID workaround for Intel 0F34 CPU */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 0xF && c->x86_model == 0x3 && - c->x86_mask == 0x4) - c->x86_phys_bits = 36; } if (c->x86 == 15) -- cgit v1.1 From 30a713180b3d08fdec5ca572e5a1cd35253c5d8e Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 22 May 2008 18:57:25 -0400 Subject: x86: Move the 64-bit Centaur specific parts out of setup_64.c Create a separate centaur_64.c file in the cpu/ dir for the useful parts to live in. Signed-off-by: Dave Jones <davej@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/centaur_64.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 28 ++-------------------------- 3 files changed, 34 insertions(+), 26 deletions(-) create mode 100644 arch/x86/kernel/cpu/centaur_64.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b7a1192..c77a1c5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o +obj-$(CONFIG_X86_64) += centaur_64.o obj-$(CONFIG_X86_32) += transmeta.o obj-$(CONFIG_X86_32) += intel.o obj-$(CONFIG_X86_64) += intel_64.o diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c new file mode 100644 index 0000000..bac96d1 --- /dev/null +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -0,0 +1,31 @@ +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x6 && c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + +void __cpuinit init_centaur(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c4e6a0b..25afdd8 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -622,32 +622,6 @@ out: #endif } -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x6 && c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); -} - -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - if (c->x86 == 0x6 && c->x86_model >= 0xf) { - c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -} - static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -667,6 +641,8 @@ extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); extern void __cpuinit init_amd(struct cpuinfo_x86 *c); extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c); extern void __cpuinit init_intel(struct cpuinfo_x86 *c); +extern void __cpuinit early_init_centaur(struct cpuinfo_x86 *c); +extern void __cpuinit init_centaur(struct cpuinfo_x86 *c); /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu -- cgit v1.1 From 1c47cd638e8302bc38be1f6d81067950e038ebd3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 30 May 2008 15:42:45 -0700 Subject: x86: fix overlong line in arch/x86/kernel/cpu/amd_64.c Clean up an overlong line in arch/x86/kernel/cpu/amd_64.c. Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/cpu/amd_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 1746f6f..c815c2c 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -229,7 +229,8 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) && - (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) + (tseg >> PMD_SHIFT) < + (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT))) set_memory_4k((unsigned long)__va(tseg), 1); } } -- cgit v1.1 From 23968f71b26ece45ed52895d41b0208b90a516e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@redhat.com> Date: Thu, 29 May 2008 18:31:14 -0400 Subject: x86: Use structs instead of hardcoded offsets in x86 boot decompressor. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace hardcoded offsets embedded in macros in arch/x86/boot/compressed with proper structure references. Signed-off-by: Kristian Høgsberg <krh@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/boot/compressed/misc.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 90456ce..74ed3c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -30,6 +30,7 @@ #include <asm/io.h> #include <asm/page.h> #include <asm/boot.h> +#include <asm/bootparam.h> /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -187,13 +188,7 @@ static void gzip_release(void **); /* * This is set up by the setup-routine at boot-time */ -static unsigned char *real_mode; /* Pointer to real-mode data */ - -#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) -#ifndef STANDARD_MEMORY_BIOS_CALL -#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) -#endif -#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) +static struct boot_params *real_mode; /* Pointer to real-mode data */ extern unsigned char input_data[]; extern int input_len; @@ -276,12 +271,13 @@ static void putstr(const char *s) char c; #ifdef CONFIG_X86_32 - if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) + if (real_mode->screen_info.orig_video_mode == 0 && + lines == 0 && cols == 0) return; #endif - x = RM_SCREEN_INFO.orig_x; - y = RM_SCREEN_INFO.orig_y; + x = real_mode->screen_info.orig_x; + y = real_mode->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -302,8 +298,8 @@ static void putstr(const char *s) } } - RM_SCREEN_INFO.orig_x = x; - RM_SCREEN_INFO.orig_y = y; + real_mode->screen_info.orig_x = x; + real_mode->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -430,7 +426,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (RM_SCREEN_INFO.orig_video_mode == 7) { + if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -438,8 +434,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = RM_SCREEN_INFO.orig_video_lines; - cols = RM_SCREEN_INFO.orig_video_cols; + lines = real_mode->screen_info.orig_video_lines; + cols = real_mode->screen_info.orig_video_cols; window = output; /* Output buffer (Normally at 1M) */ free_mem_ptr = heap; /* Heap */ -- cgit v1.1 From 3b6b9293d0f8e1b11630102013ca2a1dcef17d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= <krh@redhat.com> Date: Thu, 29 May 2008 18:31:15 -0400 Subject: x86: Honor 'quiet' command line option in real mode boot decompressor. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch lets the early real mode code look for the 'quiet' option on the kernel command line and pass a loadflag to the decompressor. When this flag is set, we suppress the "Decompressing Linux... Parsing ELF... done." messages. Signed-off-by: Kristian Høgsberg <krh@redhat.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/boot/compressed/misc.c | 13 ++++++++++--- arch/x86/boot/main.c | 4 ++++ include/asm-x86/bootparam.h | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 74ed3c0..d10e727 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -189,6 +189,7 @@ static void gzip_release(void **); * This is set up by the setup-routine at boot-time */ static struct boot_params *real_mode; /* Pointer to real-mode data */ +static int quiet; extern unsigned char input_data[]; extern int input_len; @@ -391,7 +392,8 @@ static void parse_elf(void *output) return; } - putstr("Parsing ELF... "); + if (!quiet) + putstr("Parsing ELF... "); phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum); if (!phdrs) @@ -426,6 +428,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; + if (real_mode->hdr.loadflags & QUIET_FLAG) + quiet = 1; + if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; @@ -461,9 +466,11 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, #endif makecrc(); - putstr("\nDecompressing Linux... "); + if (!quiet) + putstr("\nDecompressing Linux... "); gunzip(); parse_elf(output); - putstr("done.\nBooting the kernel.\n"); + if (!quiet) + putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 77569a4..2296164 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -165,6 +165,10 @@ void main(void) /* Set the video mode */ set_video(); + /* Parse command line for 'quiet' and pass it to decompressor. */ + if (cmdline_find_option_bool("quiet")) + boot_params.hdr.loadflags |= QUIET_FLAG; + /* Do the last things and invoke protected mode */ go_to_protected_mode(); } diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index f62f473..3e36ba8 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -40,6 +40,7 @@ struct setup_header { __u8 type_of_loader; __u8 loadflags; #define LOADED_HIGH (1<<0) +#define QUIET_FLAG (1<<5) #define KEEP_SEGMENTS (1<<6) #define CAN_USE_HEAP (1<<7) __u16 setup_move_size; -- cgit v1.1 From 4039feb5bae72a5fed9ba6bc1a9cfd8dfe0a8613 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 30 May 2008 17:16:20 -0700 Subject: x86: update Documentation/i386/boot.txt Document QUIET_FLAG, correct the definition of several fields, make it clear this applies to the entire x86 architecture, not just i386. Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- Documentation/i386/boot.txt | 79 ++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 95ad15c..147bfe5 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -1,17 +1,14 @@ - THE LINUX/I386 BOOT PROTOCOL - ---------------------------- + THE LINUX/x86 BOOT PROTOCOL + --------------------------- - H. Peter Anvin <hpa@zytor.com> - Last update 2007-05-23 - -On the i386 platform, the Linux kernel uses a rather complicated boot +On the x86 platform, the Linux kernel uses a rather complicated boot convention. This has evolved partially due to historical aspects, as well as the desire in the early days to have the kernel itself be a bootable image, the complicated PC memory model and due to changed expectations in the PC industry caused by the effective demise of real-mode DOS as a mainstream operating system. -Currently, the following versions of the Linux/i386 boot protocol exist. +Currently, the following versions of the Linux/x86 boot protocol exist. Old kernels: zImage/Image support only. Some very early kernels may not even support a command line. @@ -372,10 +369,17 @@ Protocol: 2.00+ - If 0, the protected-mode code is loaded at 0x10000. - If 1, the protected-mode code is loaded at 0x100000. + Bit 5 (write): QUIET_FLAG + - If 0, print early messages. + - If 1, suppress early messages. + This requests to the kernel (decompressor and early + kernel) to not write early messages that require + accessing the display hardware directly. + Bit 6 (write): KEEP_SEGMENTS Protocol: 2.07+ - - if 0, reload the segment registers in the 32bit entry point. - - if 1, do not reload the segment registers in the 32bit entry point. + - If 0, reload the segment registers in the 32bit entry point. + - If 1, do not reload the segment registers in the 32bit entry point. Assume that %cs %ds %ss %es are all set to flat segments with a base of 0 (or the equivalent for their environment). @@ -504,7 +508,7 @@ Protocol: 2.06+ maximum size was 255. Field name: hardware_subarch -Type: write +Type: write (optional, defaults to x86/PC) Offset/size: 0x23c/4 Protocol: 2.07+ @@ -520,11 +524,13 @@ Protocol: 2.07+ 0x00000002 Xen Field name: hardware_subarch_data -Type: write +Type: write (subarch-dependent) Offset/size: 0x240/8 Protocol: 2.07+ A pointer to data that is specific to hardware subarch + This field is currently unused for the default x86/PC environment, + do not modify. Field name: payload_offset Type: read @@ -545,6 +551,34 @@ Protocol: 2.08+ The length of the payload. +Field name: setup_data +Type: write (special) +Offset/size: 0x250/8 +Protocol: 2.09+ + + The 64-bit physical pointer to NULL terminated single linked list of + struct setup_data. This is used to define a more extensible boot + parameters passing mechanism. The definition of struct setup_data is + as follow: + + struct setup_data { + u64 next; + u32 type; + u32 len; + u8 data[0]; + }; + + Where, the next is a 64-bit physical pointer to the next node of + linked list, the next field of the last node is 0; the type is used + to identify the contents of data; the len is the length of data + field; the data holds the real payload. + + This list may be modified at a number of points during the bootup + process. Therefore, when modifying this list one should always make + sure to consider the case where the linked list already contains + entries. + + **** THE IMAGE CHECKSUM From boot protocol version 2.08 onwards the CRC-32 is calculated over @@ -553,6 +587,7 @@ initial remainder of 0xffffffff. The checksum is appended to the file; therefore the CRC of the file up to the limit specified in the syssize field of the header is always 0. + **** THE KERNEL COMMAND LINE The kernel command line has become an important way for the boot @@ -584,28 +619,6 @@ command line is entered using the following protocol: covered by setup_move_size, so you may need to adjust this field. -Field name: setup_data -Type: write (obligatory) -Offset/size: 0x250/8 -Protocol: 2.09+ - - The 64-bit physical pointer to NULL terminated single linked list of - struct setup_data. This is used to define a more extensible boot - parameters passing mechanism. The definition of struct setup_data is - as follow: - - struct setup_data { - u64 next; - u32 type; - u32 len; - u8 data[0]; - }; - - Where, the next is a 64-bit physical pointer to the next node of - linked list, the next field of the last node is 0; the type is used - to identify the contents of data; the len is the length of data - field; the data holds the real payload. - **** MEMORY LAYOUT OF THE REAL-MODE CODE -- cgit v1.1 From 23deb06821442506615f34bd92ccd6a2422629d7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 30 May 2008 17:19:03 -0700 Subject: x86: move x86-specific documentation into Documentation/x86 The current organization of the x86 documentation makes it appear as if the "i386" documentation doesn't apply to x86-64, which is does. Thus, move that documentation into Documentation/x86, and move the x86-64-specific stuff into Documentation/x86/x86_64 with the eventual goal to move stuff that isn't actually 64-bit specific back into Documentation/x86. Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- Documentation/i386/IO-APIC.txt | 119 ---- Documentation/i386/boot.txt | 900 ------------------------- Documentation/i386/usb-legacy-support.txt | 44 -- Documentation/i386/zero-page.txt | 31 - Documentation/x86/i386/IO-APIC.txt | 119 ++++ Documentation/x86/i386/boot.txt | 900 +++++++++++++++++++++++++ Documentation/x86/i386/usb-legacy-support.txt | 44 ++ Documentation/x86/i386/zero-page.txt | 31 + Documentation/x86/x86_64/00-INDEX | 16 + Documentation/x86/x86_64/boot-options.txt | 314 +++++++++ Documentation/x86/x86_64/cpu-hotplug-spec | 21 + Documentation/x86/x86_64/fake-numa-for-cpusets | 66 ++ Documentation/x86/x86_64/kernel-stacks | 99 +++ Documentation/x86/x86_64/machinecheck | 77 +++ Documentation/x86/x86_64/mm.txt | 29 + Documentation/x86/x86_64/uefi.txt | 38 ++ Documentation/x86_64/00-INDEX | 16 - Documentation/x86_64/boot-options.txt | 314 --------- Documentation/x86_64/cpu-hotplug-spec | 21 - Documentation/x86_64/fake-numa-for-cpusets | 66 -- Documentation/x86_64/kernel-stacks | 99 --- Documentation/x86_64/machinecheck | 77 --- Documentation/x86_64/mm.txt | 29 - Documentation/x86_64/uefi.txt | 38 -- 24 files changed, 1754 insertions(+), 1754 deletions(-) delete mode 100644 Documentation/i386/IO-APIC.txt delete mode 100644 Documentation/i386/boot.txt delete mode 100644 Documentation/i386/usb-legacy-support.txt delete mode 100644 Documentation/i386/zero-page.txt create mode 100644 Documentation/x86/i386/IO-APIC.txt create mode 100644 Documentation/x86/i386/boot.txt create mode 100644 Documentation/x86/i386/usb-legacy-support.txt create mode 100644 Documentation/x86/i386/zero-page.txt create mode 100644 Documentation/x86/x86_64/00-INDEX create mode 100644 Documentation/x86/x86_64/boot-options.txt create mode 100644 Documentation/x86/x86_64/cpu-hotplug-spec create mode 100644 Documentation/x86/x86_64/fake-numa-for-cpusets create mode 100644 Documentation/x86/x86_64/kernel-stacks create mode 100644 Documentation/x86/x86_64/machinecheck create mode 100644 Documentation/x86/x86_64/mm.txt create mode 100644 Documentation/x86/x86_64/uefi.txt delete mode 100644 Documentation/x86_64/00-INDEX delete mode 100644 Documentation/x86_64/boot-options.txt delete mode 100644 Documentation/x86_64/cpu-hotplug-spec delete mode 100644 Documentation/x86_64/fake-numa-for-cpusets delete mode 100644 Documentation/x86_64/kernel-stacks delete mode 100644 Documentation/x86_64/machinecheck delete mode 100644 Documentation/x86_64/mm.txt delete mode 100644 Documentation/x86_64/uefi.txt diff --git a/Documentation/i386/IO-APIC.txt b/Documentation/i386/IO-APIC.txt deleted file mode 100644 index 30b4c71..0000000 --- a/Documentation/i386/IO-APIC.txt +++ /dev/null @@ -1,119 +0,0 @@ -Most (all) Intel-MP compliant SMP boards have the so-called 'IO-APIC', -which is an enhanced interrupt controller. It enables us to route -hardware interrupts to multiple CPUs, or to CPU groups. Without an -IO-APIC, interrupts from hardware will be delivered only to the -CPU which boots the operating system (usually CPU#0). - -Linux supports all variants of compliant SMP boards, including ones with -multiple IO-APICs. Multiple IO-APICs are used in high-end servers to -distribute IRQ load further. - -There are (a few) known breakages in certain older boards, such bugs are -usually worked around by the kernel. If your MP-compliant SMP board does -not boot Linux, then consult the linux-smp mailing list archives first. - -If your box boots fine with enabled IO-APIC IRQs, then your -/proc/interrupts will look like this one: - - ----------------------------> - hell:~> cat /proc/interrupts - CPU0 - 0: 1360293 IO-APIC-edge timer - 1: 4 IO-APIC-edge keyboard - 2: 0 XT-PIC cascade - 13: 1 XT-PIC fpu - 14: 1448 IO-APIC-edge ide0 - 16: 28232 IO-APIC-level Intel EtherExpress Pro 10/100 Ethernet - 17: 51304 IO-APIC-level eth0 - NMI: 0 - ERR: 0 - hell:~> - <---------------------------- - -Some interrupts are still listed as 'XT PIC', but this is not a problem; -none of those IRQ sources is performance-critical. - - -In the unlikely case that your board does not create a working mp-table, -you can use the pirq= boot parameter to 'hand-construct' IRQ entries. This -is non-trivial though and cannot be automated. One sample /etc/lilo.conf -entry: - - append="pirq=15,11,10" - -The actual numbers depend on your system, on your PCI cards and on their -PCI slot position. Usually PCI slots are 'daisy chained' before they are -connected to the PCI chipset IRQ routing facility (the incoming PIRQ1-4 -lines): - - ,-. ,-. ,-. ,-. ,-. - PIRQ4 ----| |-. ,-| |-. ,-| |-. ,-| |--------| | - |S| \ / |S| \ / |S| \ / |S| |S| - PIRQ3 ----|l|-. `/---|l|-. `/---|l|-. `/---|l|--------|l| - |o| \/ |o| \/ |o| \/ |o| |o| - PIRQ2 ----|t|-./`----|t|-./`----|t|-./`----|t|--------|t| - |1| /\ |2| /\ |3| /\ |4| |5| - PIRQ1 ----| |- `----| |- `----| |- `----| |--------| | - `-' `-' `-' `-' `-' - -Every PCI card emits a PCI IRQ, which can be INTA, INTB, INTC or INTD: - - ,-. - INTD--| | - |S| - INTC--|l| - |o| - INTB--|t| - |x| - INTA--| | - `-' - -These INTA-D PCI IRQs are always 'local to the card', their real meaning -depends on which slot they are in. If you look at the daisy chaining diagram, -a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ4 of -the PCI chipset. Most cards issue INTA, this creates optimal distribution -between the PIRQ lines. (distributing IRQ sources properly is not a -necessity, PCI IRQs can be shared at will, but it's a good for performance -to have non shared interrupts). Slot5 should be used for videocards, they -do not use interrupts normally, thus they are not daisy chained either. - -so if you have your SCSI card (IRQ11) in Slot1, Tulip card (IRQ9) in -Slot2, then you'll have to specify this pirq= line: - - append="pirq=11,9" - -the following script tries to figure out such a default pirq= line from -your PCI configuration: - - echo -n pirq=; echo `scanpci | grep T_L | cut -c56-` | sed 's/ /,/g' - -note that this script wont work if you have skipped a few slots or if your -board does not do default daisy-chaining. (or the IO-APIC has the PIRQ pins -connected in some strange way). E.g. if in the above case you have your SCSI -card (IRQ11) in Slot3, and have Slot1 empty: - - append="pirq=0,9,11" - -[value '0' is a generic 'placeholder', reserved for empty (or non-IRQ emitting) -slots.] - -Generally, it's always possible to find out the correct pirq= settings, just -permute all IRQ numbers properly ... it will take some time though. An -'incorrect' pirq line will cause the booting process to hang, or a device -won't function properly (e.g. if it's inserted as a module). - -If you have 2 PCI buses, then you can use up to 8 pirq values, although such -boards tend to have a good configuration. - -Be prepared that it might happen that you need some strange pirq line: - - append="pirq=0,0,0,0,0,0,9,11" - -Use smart trial-and-error techniques to find out the correct pirq line ... - -Good luck and mail to linux-smp@vger.kernel.org or -linux-kernel@vger.kernel.org if you have any problems that are not covered -by this document. - --- mingo - diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt deleted file mode 100644 index 147bfe5..0000000 --- a/Documentation/i386/boot.txt +++ /dev/null @@ -1,900 +0,0 @@ - THE LINUX/x86 BOOT PROTOCOL - --------------------------- - -On the x86 platform, the Linux kernel uses a rather complicated boot -convention. This has evolved partially due to historical aspects, as -well as the desire in the early days to have the kernel itself be a -bootable image, the complicated PC memory model and due to changed -expectations in the PC industry caused by the effective demise of -real-mode DOS as a mainstream operating system. - -Currently, the following versions of the Linux/x86 boot protocol exist. - -Old kernels: zImage/Image support only. Some very early kernels - may not even support a command line. - -Protocol 2.00: (Kernel 1.3.73) Added bzImage and initrd support, as - well as a formalized way to communicate between the - boot loader and the kernel. setup.S made relocatable, - although the traditional setup area still assumed - writable. - -Protocol 2.01: (Kernel 1.3.76) Added a heap overrun warning. - -Protocol 2.02: (Kernel 2.4.0-test3-pre3) New command line protocol. - Lower the conventional memory ceiling. No overwrite - of the traditional setup area, thus making booting - safe for systems which use the EBDA from SMM or 32-bit - BIOS entry points. zImage deprecated but still - supported. - -Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible - initrd address available to the bootloader. - -Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes. - -Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable. - Introduce relocatable_kernel and kernel_alignment fields. - -Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of - the boot command line. - -Protocol 2.07: (Kernel 2.6.24) Added paravirtualised boot protocol. - Introduced hardware_subarch and hardware_subarch_data - and KEEP_SEGMENTS flag in load_flags. - -Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format - payload. Introduced payload_offset and payload length - fields to aid in locating the payload. - -Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical - pointer to single linked list of struct setup_data. - -**** MEMORY LAYOUT - -The traditional memory map for the kernel loader, used for Image or -zImage kernels, typically looks like: - - | | -0A0000 +------------------------+ - | Reserved for BIOS | Do not use. Reserved for BIOS EBDA. -09A000 +------------------------+ - | Command line | - | Stack/heap | For use by the kernel real-mode code. -098000 +------------------------+ - | Kernel setup | The kernel real-mode code. -090200 +------------------------+ - | Kernel boot sector | The kernel legacy boot sector. -090000 +------------------------+ - | Protected-mode kernel | The bulk of the kernel image. -010000 +------------------------+ - | Boot loader | <- Boot sector entry point 0000:7C00 -001000 +------------------------+ - | Reserved for MBR/BIOS | -000800 +------------------------+ - | Typically used by MBR | -000600 +------------------------+ - | BIOS use only | -000000 +------------------------+ - - -When using bzImage, the protected-mode kernel was relocated to -0x100000 ("high memory"), and the kernel real-mode block (boot sector, -setup, and stack/heap) was made relocatable to any address between -0x10000 and end of low memory. Unfortunately, in protocols 2.00 and -2.01 the 0x90000+ memory range is still used internally by the kernel; -the 2.02 protocol resolves that problem. - -It is desirable to keep the "memory ceiling" -- the highest point in -low memory touched by the boot loader -- as low as possible, since -some newer BIOSes have begun to allocate some rather large amounts of -memory, called the Extended BIOS Data Area, near the top of low -memory. The boot loader should use the "INT 12h" BIOS call to verify -how much low memory is available. - -Unfortunately, if INT 12h reports that the amount of memory is too -low, there is usually nothing the boot loader can do but to report an -error to the user. The boot loader should therefore be designed to -take up as little space in low memory as it reasonably can. For -zImage or old bzImage kernels, which need data written into the -0x90000 segment, the boot loader should make sure not to use memory -above the 0x9A000 point; too many BIOSes will break above that point. - -For a modern bzImage kernel with boot protocol version >= 2.02, a -memory layout like the following is suggested: - - ~ ~ - | Protected-mode kernel | -100000 +------------------------+ - | I/O memory hole | -0A0000 +------------------------+ - | Reserved for BIOS | Leave as much as possible unused - ~ ~ - | Command line | (Can also be below the X+10000 mark) -X+10000 +------------------------+ - | Stack/heap | For use by the kernel real-mode code. -X+08000 +------------------------+ - | Kernel setup | The kernel real-mode code. - | Kernel boot sector | The kernel legacy boot sector. -X +------------------------+ - | Boot loader | <- Boot sector entry point 0000:7C00 -001000 +------------------------+ - | Reserved for MBR/BIOS | -000800 +------------------------+ - | Typically used by MBR | -000600 +------------------------+ - | BIOS use only | -000000 +------------------------+ - -... where the address X is as low as the design of the boot loader -permits. - - -**** THE REAL-MODE KERNEL HEADER - -In the following text, and anywhere in the kernel boot sequence, "a -sector" refers to 512 bytes. It is independent of the actual sector -size of the underlying medium. - -The first step in loading a Linux kernel should be to load the -real-mode code (boot sector and setup code) and then examine the -following header at offset 0x01f1. The real-mode code can total up to -32K, although the boot loader may choose to load only the first two -sectors (1K) and then examine the bootup sector size. - -The header looks like: - -Offset Proto Name Meaning -/Size - -01F1/1 ALL(1 setup_sects The size of the setup in sectors -01F2/2 ALL root_flags If set, the root is mounted readonly -01F4/4 2.04+(2 syssize The size of the 32-bit code in 16-byte paras -01F8/2 ALL ram_size DO NOT USE - for bootsect.S use only -01FA/2 ALL vid_mode Video mode control -01FC/2 ALL root_dev Default root device number -01FE/2 ALL boot_flag 0xAA55 magic number -0200/2 2.00+ jump Jump instruction -0202/4 2.00+ header Magic signature "HdrS" -0206/2 2.00+ version Boot protocol version supported -0208/4 2.00+ realmode_swtch Boot loader hook (see below) -020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) -020E/2 2.00+ kernel_version Pointer to kernel version string -0210/1 2.00+ type_of_loader Boot loader identifier -0211/1 2.00+ loadflags Boot protocol option flags -0212/2 2.00+ setup_move_size Move to high memory size (used with hooks) -0214/4 2.00+ code32_start Boot loader hook (see below) -0218/4 2.00+ ramdisk_image initrd load address (set by boot loader) -021C/4 2.00+ ramdisk_size initrd size (set by boot loader) -0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only -0224/2 2.01+ heap_end_ptr Free memory after setup end -0226/2 N/A pad1 Unused -0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line -022C/4 2.03+ initrd_addr_max Highest legal initrd address -0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel -0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not -0235/3 N/A pad2 Unused -0238/4 2.06+ cmdline_size Maximum size of the kernel command line -023C/4 2.07+ hardware_subarch Hardware subarchitecture -0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data -0248/4 2.08+ payload_offset Offset of kernel payload -024C/4 2.08+ payload_length Length of kernel payload -0250/8 2.09+ setup_data 64-bit physical pointer to linked list - of struct setup_data - -(1) For backwards compatibility, if the setup_sects field contains 0, the - real value is 4. - -(2) For boot protocol prior to 2.04, the upper two bytes of the syssize - field are unusable, which means the size of a bzImage kernel - cannot be determined. - -If the "HdrS" (0x53726448) magic number is not found at offset 0x202, -the boot protocol version is "old". Loading an old kernel, the -following parameters should be assumed: - - Image type = zImage - initrd not supported - Real-mode kernel must be located at 0x90000. - -Otherwise, the "version" field contains the protocol version, -e.g. protocol version 2.01 will contain 0x0201 in this field. When -setting fields in the header, you must make sure only to set fields -supported by the protocol version in use. - - -**** DETAILS OF HEADER FIELDS - -For each field, some are information from the kernel to the bootloader -("read"), some are expected to be filled out by the bootloader -("write"), and some are expected to be read and modified by the -bootloader ("modify"). - -All general purpose boot loaders should write the fields marked -(obligatory). Boot loaders who want to load the kernel at a -nonstandard address should fill in the fields marked (reloc); other -boot loaders can ignore those fields. - -The byte order of all fields is littleendian (this is x86, after all.) - -Field name: setup_sects -Type: read -Offset/size: 0x1f1/1 -Protocol: ALL - - The size of the setup code in 512-byte sectors. If this field is - 0, the real value is 4. The real-mode code consists of the boot - sector (always one 512-byte sector) plus the setup code. - -Field name: root_flags -Type: modify (optional) -Offset/size: 0x1f2/2 -Protocol: ALL - - If this field is nonzero, the root defaults to readonly. The use of - this field is deprecated; use the "ro" or "rw" options on the - command line instead. - -Field name: syssize -Type: read -Offset/size: 0x1f4/4 (protocol 2.04+) 0x1f4/2 (protocol ALL) -Protocol: 2.04+ - - The size of the protected-mode code in units of 16-byte paragraphs. - For protocol versions older than 2.04 this field is only two bytes - wide, and therefore cannot be trusted for the size of a kernel if - the LOAD_HIGH flag is set. - -Field name: ram_size -Type: kernel internal -Offset/size: 0x1f8/2 -Protocol: ALL - - This field is obsolete. - -Field name: vid_mode -Type: modify (obligatory) -Offset/size: 0x1fa/2 - - Please see the section on SPECIAL COMMAND LINE OPTIONS. - -Field name: root_dev -Type: modify (optional) -Offset/size: 0x1fc/2 -Protocol: ALL - - The default root device device number. The use of this field is - deprecated, use the "root=" option on the command line instead. - -Field name: boot_flag -Type: read -Offset/size: 0x1fe/2 -Protocol: ALL - - Contains 0xAA55. This is the closest thing old Linux kernels have - to a magic number. - -Field name: jump -Type: read -Offset/size: 0x200/2 -Protocol: 2.00+ - - Contains an x86 jump instruction, 0xEB followed by a signed offset - relative to byte 0x202. This can be used to determine the size of - the header. - -Field name: header -Type: read -Offset/size: 0x202/4 -Protocol: 2.00+ - - Contains the magic number "HdrS" (0x53726448). - -Field name: version -Type: read -Offset/size: 0x206/2 -Protocol: 2.00+ - - Contains the boot protocol version, in (major << 8)+minor format, - e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version - 10.17. - -Field name: readmode_swtch -Type: modify (optional) -Offset/size: 0x208/4 -Protocol: 2.00+ - - Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) - -Field name: start_sys -Type: read -Offset/size: 0x20c/4 -Protocol: 2.00+ - - The load low segment (0x1000). Obsolete. - -Field name: kernel_version -Type: read -Offset/size: 0x20e/2 -Protocol: 2.00+ - - If set to a nonzero value, contains a pointer to a NUL-terminated - human-readable kernel version number string, less 0x200. This can - be used to display the kernel version to the user. This value - should be less than (0x200*setup_sects). - - For example, if this value is set to 0x1c00, the kernel version - number string can be found at offset 0x1e00 in the kernel file. - This is a valid value if and only if the "setup_sects" field - contains the value 15 or higher, as: - - 0x1c00 < 15*0x200 (= 0x1e00) but - 0x1c00 >= 14*0x200 (= 0x1c00) - - 0x1c00 >> 9 = 14, so the minimum value for setup_secs is 15. - -Field name: type_of_loader -Type: write (obligatory) -Offset/size: 0x210/1 -Protocol: 2.00+ - - If your boot loader has an assigned id (see table below), enter - 0xTV here, where T is an identifier for the boot loader and V is - a version number. Otherwise, enter 0xFF here. - - Assigned boot loader ids: - 0 LILO (0x00 reserved for pre-2.00 bootloader) - 1 Loadlin - 2 bootsect-loader (0x20, all other values reserved) - 3 SYSLINUX - 4 EtherBoot - 5 ELILO - 7 GRuB - 8 U-BOOT - 9 Xen - A Gujin - B Qemu - - Please contact <hpa@zytor.com> if you need a bootloader ID - value assigned. - -Field name: loadflags -Type: modify (obligatory) -Offset/size: 0x211/1 -Protocol: 2.00+ - - This field is a bitmask. - - Bit 0 (read): LOADED_HIGH - - If 0, the protected-mode code is loaded at 0x10000. - - If 1, the protected-mode code is loaded at 0x100000. - - Bit 5 (write): QUIET_FLAG - - If 0, print early messages. - - If 1, suppress early messages. - This requests to the kernel (decompressor and early - kernel) to not write early messages that require - accessing the display hardware directly. - - Bit 6 (write): KEEP_SEGMENTS - Protocol: 2.07+ - - If 0, reload the segment registers in the 32bit entry point. - - If 1, do not reload the segment registers in the 32bit entry point. - Assume that %cs %ds %ss %es are all set to flat segments with - a base of 0 (or the equivalent for their environment). - - Bit 7 (write): CAN_USE_HEAP - Set this bit to 1 to indicate that the value entered in the - heap_end_ptr is valid. If this field is clear, some setup code - functionality will be disabled. - -Field name: setup_move_size -Type: modify (obligatory) -Offset/size: 0x212/2 -Protocol: 2.00-2.01 - - When using protocol 2.00 or 2.01, if the real mode kernel is not - loaded at 0x90000, it gets moved there later in the loading - sequence. Fill in this field if you want additional data (such as - the kernel command line) moved in addition to the real-mode kernel - itself. - - The unit is bytes starting with the beginning of the boot sector. - - This field is can be ignored when the protocol is 2.02 or higher, or - if the real-mode code is loaded at 0x90000. - -Field name: code32_start -Type: modify (optional, reloc) -Offset/size: 0x214/4 -Protocol: 2.00+ - - The address to jump to in protected mode. This defaults to the load - address of the kernel, and can be used by the boot loader to - determine the proper load address. - - This field can be modified for two purposes: - - 1. as a boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) - - 2. if a bootloader which does not install a hook loads a - relocatable kernel at a nonstandard address it will have to modify - this field to point to the load address. - -Field name: ramdisk_image -Type: write (obligatory) -Offset/size: 0x218/4 -Protocol: 2.00+ - - The 32-bit linear address of the initial ramdisk or ramfs. Leave at - zero if there is no initial ramdisk/ramfs. - -Field name: ramdisk_size -Type: write (obligatory) -Offset/size: 0x21c/4 -Protocol: 2.00+ - - Size of the initial ramdisk or ramfs. Leave at zero if there is no - initial ramdisk/ramfs. - -Field name: bootsect_kludge -Type: kernel internal -Offset/size: 0x220/4 -Protocol: 2.00+ - - This field is obsolete. - -Field name: heap_end_ptr -Type: write (obligatory) -Offset/size: 0x224/2 -Protocol: 2.01+ - - Set this field to the offset (from the beginning of the real-mode - code) of the end of the setup stack/heap, minus 0x0200. - -Field name: cmd_line_ptr -Type: write (obligatory) -Offset/size: 0x228/4 -Protocol: 2.02+ - - Set this field to the linear address of the kernel command line. - The kernel command line can be located anywhere between the end of - the setup heap and 0xA0000; it does not have to be located in the - same 64K segment as the real-mode code itself. - - Fill in this field even if your boot loader does not support a - command line, in which case you can point this to an empty string - (or better yet, to the string "auto".) If this field is left at - zero, the kernel will assume that your boot loader does not support - the 2.02+ protocol. - -Field name: initrd_addr_max -Type: read -Offset/size: 0x22c/4 -Protocol: 2.03+ - - The maximum address that may be occupied by the initial - ramdisk/ramfs contents. For boot protocols 2.02 or earlier, this - field is not present, and the maximum address is 0x37FFFFFF. (This - address is defined as the address of the highest safe byte, so if - your ramdisk is exactly 131072 bytes long and this field is - 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) - -Field name: kernel_alignment -Type: read (reloc) -Offset/size: 0x230/4 -Protocol: 2.05+ - - Alignment unit required by the kernel (if relocatable_kernel is true.) - -Field name: relocatable_kernel -Type: read (reloc) -Offset/size: 0x234/1 -Protocol: 2.05+ - - If this field is nonzero, the protected-mode part of the kernel can - be loaded at any address that satisfies the kernel_alignment field. - After loading, the boot loader must set the code32_start field to - point to the loaded code, or to a boot loader hook. - -Field name: cmdline_size -Type: read -Offset/size: 0x238/4 -Protocol: 2.06+ - - The maximum size of the command line without the terminating - zero. This means that the command line can contain at most - cmdline_size characters. With protocol version 2.05 and earlier, the - maximum size was 255. - -Field name: hardware_subarch -Type: write (optional, defaults to x86/PC) -Offset/size: 0x23c/4 -Protocol: 2.07+ - - In a paravirtualized environment the hardware low level architectural - pieces such as interrupt handling, page table handling, and - accessing process control registers needs to be done differently. - - This field allows the bootloader to inform the kernel we are in one - one of those environments. - - 0x00000000 The default x86/PC environment - 0x00000001 lguest - 0x00000002 Xen - -Field name: hardware_subarch_data -Type: write (subarch-dependent) -Offset/size: 0x240/8 -Protocol: 2.07+ - - A pointer to data that is specific to hardware subarch - This field is currently unused for the default x86/PC environment, - do not modify. - -Field name: payload_offset -Type: read -Offset/size: 0x248/4 -Protocol: 2.08+ - - If non-zero then this field contains the offset from the end of the - real-mode code to the payload. - - The payload may be compressed. The format of both the compressed and - uncompressed data should be determined using the standard magic - numbers. Currently only gzip compressed ELF is used. - -Field name: payload_length -Type: read -Offset/size: 0x24c/4 -Protocol: 2.08+ - - The length of the payload. - -Field name: setup_data -Type: write (special) -Offset/size: 0x250/8 -Protocol: 2.09+ - - The 64-bit physical pointer to NULL terminated single linked list of - struct setup_data. This is used to define a more extensible boot - parameters passing mechanism. The definition of struct setup_data is - as follow: - - struct setup_data { - u64 next; - u32 type; - u32 len; - u8 data[0]; - }; - - Where, the next is a 64-bit physical pointer to the next node of - linked list, the next field of the last node is 0; the type is used - to identify the contents of data; the len is the length of data - field; the data holds the real payload. - - This list may be modified at a number of points during the bootup - process. Therefore, when modifying this list one should always make - sure to consider the case where the linked list already contains - entries. - - -**** THE IMAGE CHECKSUM - -From boot protocol version 2.08 onwards the CRC-32 is calculated over -the entire file using the characteristic polynomial 0x04C11DB7 and an -initial remainder of 0xffffffff. The checksum is appended to the -file; therefore the CRC of the file up to the limit specified in the -syssize field of the header is always 0. - - -**** THE KERNEL COMMAND LINE - -The kernel command line has become an important way for the boot -loader to communicate with the kernel. Some of its options are also -relevant to the boot loader itself, see "special command line options" -below. - -The kernel command line is a null-terminated string. The maximum -length can be retrieved from the field cmdline_size. Before protocol -version 2.06, the maximum was 255 characters. A string that is too -long will be automatically truncated by the kernel. - -If the boot protocol version is 2.02 or later, the address of the -kernel command line is given by the header field cmd_line_ptr (see -above.) This address can be anywhere between the end of the setup -heap and 0xA0000. - -If the protocol version is *not* 2.02 or higher, the kernel -command line is entered using the following protocol: - - At offset 0x0020 (word), "cmd_line_magic", enter the magic - number 0xA33F. - - At offset 0x0022 (word), "cmd_line_offset", enter the offset - of the kernel command line (relative to the start of the - real-mode kernel). - - The kernel command line *must* be within the memory region - covered by setup_move_size, so you may need to adjust this - field. - - -**** MEMORY LAYOUT OF THE REAL-MODE CODE - -The real-mode code requires a stack/heap to be set up, as well as -memory allocated for the kernel command line. This needs to be done -in the real-mode accessible memory in bottom megabyte. - -It should be noted that modern machines often have a sizable Extended -BIOS Data Area (EBDA). As a result, it is advisable to use as little -of the low megabyte as possible. - -Unfortunately, under the following circumstances the 0x90000 memory -segment has to be used: - - - When loading a zImage kernel ((loadflags & 0x01) == 0). - - When loading a 2.01 or earlier boot protocol kernel. - - -> For the 2.00 and 2.01 boot protocols, the real-mode code - can be loaded at another address, but it is internally - relocated to 0x90000. For the "old" protocol, the - real-mode code must be loaded at 0x90000. - -When loading at 0x90000, avoid using memory above 0x9a000. - -For boot protocol 2.02 or higher, the command line does not have to be -located in the same 64K segment as the real-mode setup code; it is -thus permitted to give the stack/heap the full 64K segment and locate -the command line above it. - -The kernel command line should not be located below the real-mode -code, nor should it be located in high memory. - - -**** SAMPLE BOOT CONFIGURATION - -As a sample configuration, assume the following layout of the real -mode segment: - - When loading below 0x90000, use the entire segment: - - 0x0000-0x7fff Real mode kernel - 0x8000-0xdfff Stack and heap - 0xe000-0xffff Kernel command line - - When loading at 0x90000 OR the protocol version is 2.01 or earlier: - - 0x0000-0x7fff Real mode kernel - 0x8000-0x97ff Stack and heap - 0x9800-0x9fff Kernel command line - -Such a boot loader should enter the following fields in the header: - - unsigned long base_ptr; /* base address for real-mode segment */ - - if ( setup_sects == 0 ) { - setup_sects = 4; - } - - if ( protocol >= 0x0200 ) { - type_of_loader = <type code>; - if ( loading_initrd ) { - ramdisk_image = <initrd_address>; - ramdisk_size = <initrd_size>; - } - - if ( protocol >= 0x0202 && loadflags & 0x01 ) - heap_end = 0xe000; - else - heap_end = 0x9800; - - if ( protocol >= 0x0201 ) { - heap_end_ptr = heap_end - 0x200; - loadflags |= 0x80; /* CAN_USE_HEAP */ - } - - if ( protocol >= 0x0202 ) { - cmd_line_ptr = base_ptr + heap_end; - strcpy(cmd_line_ptr, cmdline); - } else { - cmd_line_magic = 0xA33F; - cmd_line_offset = heap_end; - setup_move_size = heap_end + strlen(cmdline)+1; - strcpy(base_ptr+cmd_line_offset, cmdline); - } - } else { - /* Very old kernel */ - - heap_end = 0x9800; - - cmd_line_magic = 0xA33F; - cmd_line_offset = heap_end; - - /* A very old kernel MUST have its real-mode code - loaded at 0x90000 */ - - if ( base_ptr != 0x90000 ) { - /* Copy the real-mode kernel */ - memcpy(0x90000, base_ptr, (setup_sects+1)*512); - base_ptr = 0x90000; /* Relocated */ - } - - strcpy(0x90000+cmd_line_offset, cmdline); - - /* It is recommended to clear memory up to the 32K mark */ - memset(0x90000 + (setup_sects+1)*512, 0, - (64-(setup_sects+1))*512); - } - - -**** LOADING THE REST OF THE KERNEL - -The 32-bit (non-real-mode) kernel starts at offset (setup_sects+1)*512 -in the kernel file (again, if setup_sects == 0 the real value is 4.) -It should be loaded at address 0x10000 for Image/zImage kernels and -0x100000 for bzImage kernels. - -The kernel is a bzImage kernel if the protocol >= 2.00 and the 0x01 -bit (LOAD_HIGH) in the loadflags field is set: - - is_bzImage = (protocol >= 0x0200) && (loadflags & 0x01); - load_address = is_bzImage ? 0x100000 : 0x10000; - -Note that Image/zImage kernels can be up to 512K in size, and thus use -the entire 0x10000-0x90000 range of memory. This means it is pretty -much a requirement for these kernels to load the real-mode part at -0x90000. bzImage kernels allow much more flexibility. - - -**** SPECIAL COMMAND LINE OPTIONS - -If the command line provided by the boot loader is entered by the -user, the user may expect the following command line options to work. -They should normally not be deleted from the kernel command line even -though not all of them are actually meaningful to the kernel. Boot -loader authors who need additional command line options for the boot -loader itself should get them registered in -Documentation/kernel-parameters.txt to make sure they will not -conflict with actual kernel options now or in the future. - - vga=<mode> - <mode> here is either an integer (in C notation, either - decimal, octal, or hexadecimal) or one of the strings - "normal" (meaning 0xFFFF), "ext" (meaning 0xFFFE) or "ask" - (meaning 0xFFFD). This value should be entered into the - vid_mode field, as it is used by the kernel before the command - line is parsed. - - mem=<size> - <size> is an integer in C notation optionally followed by - (case insensitive) K, M, G, T, P or E (meaning << 10, << 20, - << 30, << 40, << 50 or << 60). This specifies the end of - memory to the kernel. This affects the possible placement of - an initrd, since an initrd should be placed near end of - memory. Note that this is an option to *both* the kernel and - the bootloader! - - initrd=<file> - An initrd should be loaded. The meaning of <file> is - obviously bootloader-dependent, and some boot loaders - (e.g. LILO) do not have such a command. - -In addition, some boot loaders add the following options to the -user-specified command line: - - BOOT_IMAGE=<file> - The boot image which was loaded. Again, the meaning of <file> - is obviously bootloader-dependent. - - auto - The kernel was booted without explicit user intervention. - -If these options are added by the boot loader, it is highly -recommended that they are located *first*, before the user-specified -or configuration-specified command line. Otherwise, "init=/bin/sh" -gets confused by the "auto" option. - - -**** RUNNING THE KERNEL - -The kernel is started by jumping to the kernel entry point, which is -located at *segment* offset 0x20 from the start of the real mode -kernel. This means that if you loaded your real-mode kernel code at -0x90000, the kernel entry point is 9020:0000. - -At entry, ds = es = ss should point to the start of the real-mode -kernel code (0x9000 if the code is loaded at 0x90000), sp should be -set up properly, normally pointing to the top of the heap, and -interrupts should be disabled. Furthermore, to guard against bugs in -the kernel, it is recommended that the boot loader sets fs = gs = ds = -es = ss. - -In our example from above, we would do: - - /* Note: in the case of the "old" kernel protocol, base_ptr must - be == 0x90000 at this point; see the previous sample code */ - - seg = base_ptr >> 4; - - cli(); /* Enter with interrupts disabled! */ - - /* Set up the real-mode kernel stack */ - _SS = seg; - _SP = heap_end; - - _DS = _ES = _FS = _GS = seg; - jmp_far(seg+0x20, 0); /* Run the kernel */ - -If your boot sector accesses a floppy drive, it is recommended to -switch off the floppy motor before running the kernel, since the -kernel boot leaves interrupts off and thus the motor will not be -switched off, especially if the loaded kernel has the floppy driver as -a demand-loaded module! - - -**** ADVANCED BOOT LOADER HOOKS - -If the boot loader runs in a particularly hostile environment (such as -LOADLIN, which runs under DOS) it may be impossible to follow the -standard memory location requirements. Such a boot loader may use the -following hooks that, if set, are invoked by the kernel at the -appropriate time. The use of these hooks should probably be -considered an absolutely last resort! - -IMPORTANT: All the hooks are required to preserve %esp, %ebp, %esi and -%edi across invocation. - - realmode_swtch: - A 16-bit real mode far subroutine invoked immediately before - entering protected mode. The default routine disables NMI, so - your routine should probably do so, too. - - code32_start: - A 32-bit flat-mode routine *jumped* to immediately after the - transition to protected mode, but before the kernel is - uncompressed. No segments, except CS, are guaranteed to be - set up (current kernels do, but older ones do not); you should - set them up to BOOT_DS (0x18) yourself. - - After completing your hook, you should jump to the address - that was in this field before your boot loader overwrote it - (relocated, if appropriate.) - - -**** 32-bit BOOT PROTOCOL - -For machine with some new BIOS other than legacy BIOS, such as EFI, -LinuxBIOS, etc, and kexec, the 16-bit real mode setup code in kernel -based on legacy BIOS can not be used, so a 32-bit boot protocol needs -to be defined. - -In 32-bit boot protocol, the first step in loading a Linux kernel -should be to setup the boot parameters (struct boot_params, -traditionally known as "zero page"). The memory for struct boot_params -should be allocated and initialized to all zero. Then the setup header -from offset 0x01f1 of kernel image on should be loaded into struct -boot_params and examined. The end of setup header can be calculated as -follow: - - 0x0202 + byte value at offset 0x0201 - -In addition to read/modify/write the setup header of the struct -boot_params as that of 16-bit boot protocol, the boot loader should -also fill the additional fields of the struct boot_params as that -described in zero-page.txt. - -After setupping the struct boot_params, the boot loader can load the -32/64-bit kernel in the same way as that of 16-bit boot protocol. - -In 32-bit boot protocol, the kernel is started by jumping to the -32-bit kernel entry point, which is the start address of loaded -32/64-bit kernel. - -At entry, the CPU must be in 32-bit protected mode with paging -disabled; a GDT must be loaded with the descriptors for selectors -__BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be 4G flat -segment; __BOOS_CS must have execute/read permission, and __BOOT_DS -must have read/write permission; CS must be __BOOT_CS and DS, ES, SS -must be __BOOT_DS; interrupt must be disabled; %esi must hold the base -address of the struct boot_params; %ebp, %edi and %ebx must be zero. diff --git a/Documentation/i386/usb-legacy-support.txt b/Documentation/i386/usb-legacy-support.txt deleted file mode 100644 index 1894cdf..0000000 --- a/Documentation/i386/usb-legacy-support.txt +++ /dev/null @@ -1,44 +0,0 @@ -USB Legacy support -~~~~~~~~~~~~~~~~~~ - -Vojtech Pavlik <vojtech@suse.cz>, January 2004 - - -Also known as "USB Keyboard" or "USB Mouse support" in the BIOS Setup is a -feature that allows one to use the USB mouse and keyboard as if they were -their classic PS/2 counterparts. This means one can use an USB keyboard to -type in LILO for example. - -It has several drawbacks, though: - -1) On some machines, the emulated PS/2 mouse takes over even when no USB - mouse is present and a real PS/2 mouse is present. In that case the extra - features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may - not be available. - -2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause - system crashes, because the SMM BIOS is not expecting to be in PAE mode. - The Intel E7505 is a typical machine where this happens. - -3) If AMD64 64-bit mode is enabled, again system crashes often happen, - because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The - BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit - yet. - -Solutions: - -Problem 1) can be solved by loading the USB drivers prior to loading the -PS/2 mouse driver. Since the PS/2 mouse driver is in 2.6 compiled into -the kernel unconditionally, this means the USB drivers need to be -compiled-in, too. - -Problem 2) can currently only be solved by either disabling HIGHMEM64G -in the kernel config or USB Legacy support in the BIOS. A BIOS update -could help, but so far no such update exists. - -Problem 3) is usually fixed by a BIOS update. Check the board -manufacturers web site. If an update is not available, disable USB -Legacy support in the BIOS. If this alone doesn't help, try also adding -idle=poll on the kernel command line. The BIOS may be entering the SMM -on the HLT instruction as well. - diff --git a/Documentation/i386/zero-page.txt b/Documentation/i386/zero-page.txt deleted file mode 100644 index 169ad42..0000000 --- a/Documentation/i386/zero-page.txt +++ /dev/null @@ -1,31 +0,0 @@ -The additional fields in struct boot_params as a part of 32-bit boot -protocol of kernel. These should be filled by bootloader or 16-bit -real-mode setup code of the kernel. References/settings to it mainly -are in: - - include/asm-x86/bootparam.h - - -Offset Proto Name Meaning -/Size - -000/040 ALL screen_info Text mode or frame buffer information - (struct screen_info) -040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) -060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information - (struct ist_info) -080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! -090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!! -0A0/010 ALL sys_desc_table System description table (struct sys_desc_table) -140/080 ALL edid_info Video mode setup (struct edid_info) -1C0/020 ALL efi_info EFI 32 information (struct efi_info) -1E0/004 ALL alk_mem_k Alternative mem check, in KB -1E4/004 ALL scratch Scratch field for the kernel setup code -1E8/001 ALL e820_entries Number of entries in e820_map (below) -1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below) -1EA/001 ALL edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer - (below) -290/040 ALL edd_mbr_sig_buffer EDD MBR signatures -2D0/A00 ALL e820_map E820 memory map table - (array of struct e820entry) -D00/1EC ALL eddbuf EDD data (array of struct edd_info) diff --git a/Documentation/x86/i386/IO-APIC.txt b/Documentation/x86/i386/IO-APIC.txt new file mode 100644 index 0000000..30b4c71 --- /dev/null +++ b/Documentation/x86/i386/IO-APIC.txt @@ -0,0 +1,119 @@ +Most (all) Intel-MP compliant SMP boards have the so-called 'IO-APIC', +which is an enhanced interrupt controller. It enables us to route +hardware interrupts to multiple CPUs, or to CPU groups. Without an +IO-APIC, interrupts from hardware will be delivered only to the +CPU which boots the operating system (usually CPU#0). + +Linux supports all variants of compliant SMP boards, including ones with +multiple IO-APICs. Multiple IO-APICs are used in high-end servers to +distribute IRQ load further. + +There are (a few) known breakages in certain older boards, such bugs are +usually worked around by the kernel. If your MP-compliant SMP board does +not boot Linux, then consult the linux-smp mailing list archives first. + +If your box boots fine with enabled IO-APIC IRQs, then your +/proc/interrupts will look like this one: + + ----------------------------> + hell:~> cat /proc/interrupts + CPU0 + 0: 1360293 IO-APIC-edge timer + 1: 4 IO-APIC-edge keyboard + 2: 0 XT-PIC cascade + 13: 1 XT-PIC fpu + 14: 1448 IO-APIC-edge ide0 + 16: 28232 IO-APIC-level Intel EtherExpress Pro 10/100 Ethernet + 17: 51304 IO-APIC-level eth0 + NMI: 0 + ERR: 0 + hell:~> + <---------------------------- + +Some interrupts are still listed as 'XT PIC', but this is not a problem; +none of those IRQ sources is performance-critical. + + +In the unlikely case that your board does not create a working mp-table, +you can use the pirq= boot parameter to 'hand-construct' IRQ entries. This +is non-trivial though and cannot be automated. One sample /etc/lilo.conf +entry: + + append="pirq=15,11,10" + +The actual numbers depend on your system, on your PCI cards and on their +PCI slot position. Usually PCI slots are 'daisy chained' before they are +connected to the PCI chipset IRQ routing facility (the incoming PIRQ1-4 +lines): + + ,-. ,-. ,-. ,-. ,-. + PIRQ4 ----| |-. ,-| |-. ,-| |-. ,-| |--------| | + |S| \ / |S| \ / |S| \ / |S| |S| + PIRQ3 ----|l|-. `/---|l|-. `/---|l|-. `/---|l|--------|l| + |o| \/ |o| \/ |o| \/ |o| |o| + PIRQ2 ----|t|-./`----|t|-./`----|t|-./`----|t|--------|t| + |1| /\ |2| /\ |3| /\ |4| |5| + PIRQ1 ----| |- `----| |- `----| |- `----| |--------| | + `-' `-' `-' `-' `-' + +Every PCI card emits a PCI IRQ, which can be INTA, INTB, INTC or INTD: + + ,-. + INTD--| | + |S| + INTC--|l| + |o| + INTB--|t| + |x| + INTA--| | + `-' + +These INTA-D PCI IRQs are always 'local to the card', their real meaning +depends on which slot they are in. If you look at the daisy chaining diagram, +a card in slot4, issuing INTA IRQ, it will end up as a signal on PIRQ4 of +the PCI chipset. Most cards issue INTA, this creates optimal distribution +between the PIRQ lines. (distributing IRQ sources properly is not a +necessity, PCI IRQs can be shared at will, but it's a good for performance +to have non shared interrupts). Slot5 should be used for videocards, they +do not use interrupts normally, thus they are not daisy chained either. + +so if you have your SCSI card (IRQ11) in Slot1, Tulip card (IRQ9) in +Slot2, then you'll have to specify this pirq= line: + + append="pirq=11,9" + +the following script tries to figure out such a default pirq= line from +your PCI configuration: + + echo -n pirq=; echo `scanpci | grep T_L | cut -c56-` | sed 's/ /,/g' + +note that this script wont work if you have skipped a few slots or if your +board does not do default daisy-chaining. (or the IO-APIC has the PIRQ pins +connected in some strange way). E.g. if in the above case you have your SCSI +card (IRQ11) in Slot3, and have Slot1 empty: + + append="pirq=0,9,11" + +[value '0' is a generic 'placeholder', reserved for empty (or non-IRQ emitting) +slots.] + +Generally, it's always possible to find out the correct pirq= settings, just +permute all IRQ numbers properly ... it will take some time though. An +'incorrect' pirq line will cause the booting process to hang, or a device +won't function properly (e.g. if it's inserted as a module). + +If you have 2 PCI buses, then you can use up to 8 pirq values, although such +boards tend to have a good configuration. + +Be prepared that it might happen that you need some strange pirq line: + + append="pirq=0,0,0,0,0,0,9,11" + +Use smart trial-and-error techniques to find out the correct pirq line ... + +Good luck and mail to linux-smp@vger.kernel.org or +linux-kernel@vger.kernel.org if you have any problems that are not covered +by this document. + +-- mingo + diff --git a/Documentation/x86/i386/boot.txt b/Documentation/x86/i386/boot.txt new file mode 100644 index 0000000..147bfe5 --- /dev/null +++ b/Documentation/x86/i386/boot.txt @@ -0,0 +1,900 @@ + THE LINUX/x86 BOOT PROTOCOL + --------------------------- + +On the x86 platform, the Linux kernel uses a rather complicated boot +convention. This has evolved partially due to historical aspects, as +well as the desire in the early days to have the kernel itself be a +bootable image, the complicated PC memory model and due to changed +expectations in the PC industry caused by the effective demise of +real-mode DOS as a mainstream operating system. + +Currently, the following versions of the Linux/x86 boot protocol exist. + +Old kernels: zImage/Image support only. Some very early kernels + may not even support a command line. + +Protocol 2.00: (Kernel 1.3.73) Added bzImage and initrd support, as + well as a formalized way to communicate between the + boot loader and the kernel. setup.S made relocatable, + although the traditional setup area still assumed + writable. + +Protocol 2.01: (Kernel 1.3.76) Added a heap overrun warning. + +Protocol 2.02: (Kernel 2.4.0-test3-pre3) New command line protocol. + Lower the conventional memory ceiling. No overwrite + of the traditional setup area, thus making booting + safe for systems which use the EBDA from SMM or 32-bit + BIOS entry points. zImage deprecated but still + supported. + +Protocol 2.03: (Kernel 2.4.18-pre1) Explicitly makes the highest possible + initrd address available to the bootloader. + +Protocol 2.04: (Kernel 2.6.14) Extend the syssize field to four bytes. + +Protocol 2.05: (Kernel 2.6.20) Make protected mode kernel relocatable. + Introduce relocatable_kernel and kernel_alignment fields. + +Protocol 2.06: (Kernel 2.6.22) Added a field that contains the size of + the boot command line. + +Protocol 2.07: (Kernel 2.6.24) Added paravirtualised boot protocol. + Introduced hardware_subarch and hardware_subarch_data + and KEEP_SEGMENTS flag in load_flags. + +Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format + payload. Introduced payload_offset and payload length + fields to aid in locating the payload. + +Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical + pointer to single linked list of struct setup_data. + +**** MEMORY LAYOUT + +The traditional memory map for the kernel loader, used for Image or +zImage kernels, typically looks like: + + | | +0A0000 +------------------------+ + | Reserved for BIOS | Do not use. Reserved for BIOS EBDA. +09A000 +------------------------+ + | Command line | + | Stack/heap | For use by the kernel real-mode code. +098000 +------------------------+ + | Kernel setup | The kernel real-mode code. +090200 +------------------------+ + | Kernel boot sector | The kernel legacy boot sector. +090000 +------------------------+ + | Protected-mode kernel | The bulk of the kernel image. +010000 +------------------------+ + | Boot loader | <- Boot sector entry point 0000:7C00 +001000 +------------------------+ + | Reserved for MBR/BIOS | +000800 +------------------------+ + | Typically used by MBR | +000600 +------------------------+ + | BIOS use only | +000000 +------------------------+ + + +When using bzImage, the protected-mode kernel was relocated to +0x100000 ("high memory"), and the kernel real-mode block (boot sector, +setup, and stack/heap) was made relocatable to any address between +0x10000 and end of low memory. Unfortunately, in protocols 2.00 and +2.01 the 0x90000+ memory range is still used internally by the kernel; +the 2.02 protocol resolves that problem. + +It is desirable to keep the "memory ceiling" -- the highest point in +low memory touched by the boot loader -- as low as possible, since +some newer BIOSes have begun to allocate some rather large amounts of +memory, called the Extended BIOS Data Area, near the top of low +memory. The boot loader should use the "INT 12h" BIOS call to verify +how much low memory is available. + +Unfortunately, if INT 12h reports that the amount of memory is too +low, there is usually nothing the boot loader can do but to report an +error to the user. The boot loader should therefore be designed to +take up as little space in low memory as it reasonably can. For +zImage or old bzImage kernels, which need data written into the +0x90000 segment, the boot loader should make sure not to use memory +above the 0x9A000 point; too many BIOSes will break above that point. + +For a modern bzImage kernel with boot protocol version >= 2.02, a +memory layout like the following is suggested: + + ~ ~ + | Protected-mode kernel | +100000 +------------------------+ + | I/O memory hole | +0A0000 +------------------------+ + | Reserved for BIOS | Leave as much as possible unused + ~ ~ + | Command line | (Can also be below the X+10000 mark) +X+10000 +------------------------+ + | Stack/heap | For use by the kernel real-mode code. +X+08000 +------------------------+ + | Kernel setup | The kernel real-mode code. + | Kernel boot sector | The kernel legacy boot sector. +X +------------------------+ + | Boot loader | <- Boot sector entry point 0000:7C00 +001000 +------------------------+ + | Reserved for MBR/BIOS | +000800 +------------------------+ + | Typically used by MBR | +000600 +------------------------+ + | BIOS use only | +000000 +------------------------+ + +... where the address X is as low as the design of the boot loader +permits. + + +**** THE REAL-MODE KERNEL HEADER + +In the following text, and anywhere in the kernel boot sequence, "a +sector" refers to 512 bytes. It is independent of the actual sector +size of the underlying medium. + +The first step in loading a Linux kernel should be to load the +real-mode code (boot sector and setup code) and then examine the +following header at offset 0x01f1. The real-mode code can total up to +32K, although the boot loader may choose to load only the first two +sectors (1K) and then examine the bootup sector size. + +The header looks like: + +Offset Proto Name Meaning +/Size + +01F1/1 ALL(1 setup_sects The size of the setup in sectors +01F2/2 ALL root_flags If set, the root is mounted readonly +01F4/4 2.04+(2 syssize The size of the 32-bit code in 16-byte paras +01F8/2 ALL ram_size DO NOT USE - for bootsect.S use only +01FA/2 ALL vid_mode Video mode control +01FC/2 ALL root_dev Default root device number +01FE/2 ALL boot_flag 0xAA55 magic number +0200/2 2.00+ jump Jump instruction +0202/4 2.00+ header Magic signature "HdrS" +0206/2 2.00+ version Boot protocol version supported +0208/4 2.00+ realmode_swtch Boot loader hook (see below) +020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) +020E/2 2.00+ kernel_version Pointer to kernel version string +0210/1 2.00+ type_of_loader Boot loader identifier +0211/1 2.00+ loadflags Boot protocol option flags +0212/2 2.00+ setup_move_size Move to high memory size (used with hooks) +0214/4 2.00+ code32_start Boot loader hook (see below) +0218/4 2.00+ ramdisk_image initrd load address (set by boot loader) +021C/4 2.00+ ramdisk_size initrd size (set by boot loader) +0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only +0224/2 2.01+ heap_end_ptr Free memory after setup end +0226/2 N/A pad1 Unused +0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line +022C/4 2.03+ initrd_addr_max Highest legal initrd address +0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel +0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not +0235/3 N/A pad2 Unused +0238/4 2.06+ cmdline_size Maximum size of the kernel command line +023C/4 2.07+ hardware_subarch Hardware subarchitecture +0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data +0248/4 2.08+ payload_offset Offset of kernel payload +024C/4 2.08+ payload_length Length of kernel payload +0250/8 2.09+ setup_data 64-bit physical pointer to linked list + of struct setup_data + +(1) For backwards compatibility, if the setup_sects field contains 0, the + real value is 4. + +(2) For boot protocol prior to 2.04, the upper two bytes of the syssize + field are unusable, which means the size of a bzImage kernel + cannot be determined. + +If the "HdrS" (0x53726448) magic number is not found at offset 0x202, +the boot protocol version is "old". Loading an old kernel, the +following parameters should be assumed: + + Image type = zImage + initrd not supported + Real-mode kernel must be located at 0x90000. + +Otherwise, the "version" field contains the protocol version, +e.g. protocol version 2.01 will contain 0x0201 in this field. When +setting fields in the header, you must make sure only to set fields +supported by the protocol version in use. + + +**** DETAILS OF HEADER FIELDS + +For each field, some are information from the kernel to the bootloader +("read"), some are expected to be filled out by the bootloader +("write"), and some are expected to be read and modified by the +bootloader ("modify"). + +All general purpose boot loaders should write the fields marked +(obligatory). Boot loaders who want to load the kernel at a +nonstandard address should fill in the fields marked (reloc); other +boot loaders can ignore those fields. + +The byte order of all fields is littleendian (this is x86, after all.) + +Field name: setup_sects +Type: read +Offset/size: 0x1f1/1 +Protocol: ALL + + The size of the setup code in 512-byte sectors. If this field is + 0, the real value is 4. The real-mode code consists of the boot + sector (always one 512-byte sector) plus the setup code. + +Field name: root_flags +Type: modify (optional) +Offset/size: 0x1f2/2 +Protocol: ALL + + If this field is nonzero, the root defaults to readonly. The use of + this field is deprecated; use the "ro" or "rw" options on the + command line instead. + +Field name: syssize +Type: read +Offset/size: 0x1f4/4 (protocol 2.04+) 0x1f4/2 (protocol ALL) +Protocol: 2.04+ + + The size of the protected-mode code in units of 16-byte paragraphs. + For protocol versions older than 2.04 this field is only two bytes + wide, and therefore cannot be trusted for the size of a kernel if + the LOAD_HIGH flag is set. + +Field name: ram_size +Type: kernel internal +Offset/size: 0x1f8/2 +Protocol: ALL + + This field is obsolete. + +Field name: vid_mode +Type: modify (obligatory) +Offset/size: 0x1fa/2 + + Please see the section on SPECIAL COMMAND LINE OPTIONS. + +Field name: root_dev +Type: modify (optional) +Offset/size: 0x1fc/2 +Protocol: ALL + + The default root device device number. The use of this field is + deprecated, use the "root=" option on the command line instead. + +Field name: boot_flag +Type: read +Offset/size: 0x1fe/2 +Protocol: ALL + + Contains 0xAA55. This is the closest thing old Linux kernels have + to a magic number. + +Field name: jump +Type: read +Offset/size: 0x200/2 +Protocol: 2.00+ + + Contains an x86 jump instruction, 0xEB followed by a signed offset + relative to byte 0x202. This can be used to determine the size of + the header. + +Field name: header +Type: read +Offset/size: 0x202/4 +Protocol: 2.00+ + + Contains the magic number "HdrS" (0x53726448). + +Field name: version +Type: read +Offset/size: 0x206/2 +Protocol: 2.00+ + + Contains the boot protocol version, in (major << 8)+minor format, + e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version + 10.17. + +Field name: readmode_swtch +Type: modify (optional) +Offset/size: 0x208/4 +Protocol: 2.00+ + + Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) + +Field name: start_sys +Type: read +Offset/size: 0x20c/4 +Protocol: 2.00+ + + The load low segment (0x1000). Obsolete. + +Field name: kernel_version +Type: read +Offset/size: 0x20e/2 +Protocol: 2.00+ + + If set to a nonzero value, contains a pointer to a NUL-terminated + human-readable kernel version number string, less 0x200. This can + be used to display the kernel version to the user. This value + should be less than (0x200*setup_sects). + + For example, if this value is set to 0x1c00, the kernel version + number string can be found at offset 0x1e00 in the kernel file. + This is a valid value if and only if the "setup_sects" field + contains the value 15 or higher, as: + + 0x1c00 < 15*0x200 (= 0x1e00) but + 0x1c00 >= 14*0x200 (= 0x1c00) + + 0x1c00 >> 9 = 14, so the minimum value for setup_secs is 15. + +Field name: type_of_loader +Type: write (obligatory) +Offset/size: 0x210/1 +Protocol: 2.00+ + + If your boot loader has an assigned id (see table below), enter + 0xTV here, where T is an identifier for the boot loader and V is + a version number. Otherwise, enter 0xFF here. + + Assigned boot loader ids: + 0 LILO (0x00 reserved for pre-2.00 bootloader) + 1 Loadlin + 2 bootsect-loader (0x20, all other values reserved) + 3 SYSLINUX + 4 EtherBoot + 5 ELILO + 7 GRuB + 8 U-BOOT + 9 Xen + A Gujin + B Qemu + + Please contact <hpa@zytor.com> if you need a bootloader ID + value assigned. + +Field name: loadflags +Type: modify (obligatory) +Offset/size: 0x211/1 +Protocol: 2.00+ + + This field is a bitmask. + + Bit 0 (read): LOADED_HIGH + - If 0, the protected-mode code is loaded at 0x10000. + - If 1, the protected-mode code is loaded at 0x100000. + + Bit 5 (write): QUIET_FLAG + - If 0, print early messages. + - If 1, suppress early messages. + This requests to the kernel (decompressor and early + kernel) to not write early messages that require + accessing the display hardware directly. + + Bit 6 (write): KEEP_SEGMENTS + Protocol: 2.07+ + - If 0, reload the segment registers in the 32bit entry point. + - If 1, do not reload the segment registers in the 32bit entry point. + Assume that %cs %ds %ss %es are all set to flat segments with + a base of 0 (or the equivalent for their environment). + + Bit 7 (write): CAN_USE_HEAP + Set this bit to 1 to indicate that the value entered in the + heap_end_ptr is valid. If this field is clear, some setup code + functionality will be disabled. + +Field name: setup_move_size +Type: modify (obligatory) +Offset/size: 0x212/2 +Protocol: 2.00-2.01 + + When using protocol 2.00 or 2.01, if the real mode kernel is not + loaded at 0x90000, it gets moved there later in the loading + sequence. Fill in this field if you want additional data (such as + the kernel command line) moved in addition to the real-mode kernel + itself. + + The unit is bytes starting with the beginning of the boot sector. + + This field is can be ignored when the protocol is 2.02 or higher, or + if the real-mode code is loaded at 0x90000. + +Field name: code32_start +Type: modify (optional, reloc) +Offset/size: 0x214/4 +Protocol: 2.00+ + + The address to jump to in protected mode. This defaults to the load + address of the kernel, and can be used by the boot loader to + determine the proper load address. + + This field can be modified for two purposes: + + 1. as a boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) + + 2. if a bootloader which does not install a hook loads a + relocatable kernel at a nonstandard address it will have to modify + this field to point to the load address. + +Field name: ramdisk_image +Type: write (obligatory) +Offset/size: 0x218/4 +Protocol: 2.00+ + + The 32-bit linear address of the initial ramdisk or ramfs. Leave at + zero if there is no initial ramdisk/ramfs. + +Field name: ramdisk_size +Type: write (obligatory) +Offset/size: 0x21c/4 +Protocol: 2.00+ + + Size of the initial ramdisk or ramfs. Leave at zero if there is no + initial ramdisk/ramfs. + +Field name: bootsect_kludge +Type: kernel internal +Offset/size: 0x220/4 +Protocol: 2.00+ + + This field is obsolete. + +Field name: heap_end_ptr +Type: write (obligatory) +Offset/size: 0x224/2 +Protocol: 2.01+ + + Set this field to the offset (from the beginning of the real-mode + code) of the end of the setup stack/heap, minus 0x0200. + +Field name: cmd_line_ptr +Type: write (obligatory) +Offset/size: 0x228/4 +Protocol: 2.02+ + + Set this field to the linear address of the kernel command line. + The kernel command line can be located anywhere between the end of + the setup heap and 0xA0000; it does not have to be located in the + same 64K segment as the real-mode code itself. + + Fill in this field even if your boot loader does not support a + command line, in which case you can point this to an empty string + (or better yet, to the string "auto".) If this field is left at + zero, the kernel will assume that your boot loader does not support + the 2.02+ protocol. + +Field name: initrd_addr_max +Type: read +Offset/size: 0x22c/4 +Protocol: 2.03+ + + The maximum address that may be occupied by the initial + ramdisk/ramfs contents. For boot protocols 2.02 or earlier, this + field is not present, and the maximum address is 0x37FFFFFF. (This + address is defined as the address of the highest safe byte, so if + your ramdisk is exactly 131072 bytes long and this field is + 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) + +Field name: kernel_alignment +Type: read (reloc) +Offset/size: 0x230/4 +Protocol: 2.05+ + + Alignment unit required by the kernel (if relocatable_kernel is true.) + +Field name: relocatable_kernel +Type: read (reloc) +Offset/size: 0x234/1 +Protocol: 2.05+ + + If this field is nonzero, the protected-mode part of the kernel can + be loaded at any address that satisfies the kernel_alignment field. + After loading, the boot loader must set the code32_start field to + point to the loaded code, or to a boot loader hook. + +Field name: cmdline_size +Type: read +Offset/size: 0x238/4 +Protocol: 2.06+ + + The maximum size of the command line without the terminating + zero. This means that the command line can contain at most + cmdline_size characters. With protocol version 2.05 and earlier, the + maximum size was 255. + +Field name: hardware_subarch +Type: write (optional, defaults to x86/PC) +Offset/size: 0x23c/4 +Protocol: 2.07+ + + In a paravirtualized environment the hardware low level architectural + pieces such as interrupt handling, page table handling, and + accessing process control registers needs to be done differently. + + This field allows the bootloader to inform the kernel we are in one + one of those environments. + + 0x00000000 The default x86/PC environment + 0x00000001 lguest + 0x00000002 Xen + +Field name: hardware_subarch_data +Type: write (subarch-dependent) +Offset/size: 0x240/8 +Protocol: 2.07+ + + A pointer to data that is specific to hardware subarch + This field is currently unused for the default x86/PC environment, + do not modify. + +Field name: payload_offset +Type: read +Offset/size: 0x248/4 +Protocol: 2.08+ + + If non-zero then this field contains the offset from the end of the + real-mode code to the payload. + + The payload may be compressed. The format of both the compressed and + uncompressed data should be determined using the standard magic + numbers. Currently only gzip compressed ELF is used. + +Field name: payload_length +Type: read +Offset/size: 0x24c/4 +Protocol: 2.08+ + + The length of the payload. + +Field name: setup_data +Type: write (special) +Offset/size: 0x250/8 +Protocol: 2.09+ + + The 64-bit physical pointer to NULL terminated single linked list of + struct setup_data. This is used to define a more extensible boot + parameters passing mechanism. The definition of struct setup_data is + as follow: + + struct setup_data { + u64 next; + u32 type; + u32 len; + u8 data[0]; + }; + + Where, the next is a 64-bit physical pointer to the next node of + linked list, the next field of the last node is 0; the type is used + to identify the contents of data; the len is the length of data + field; the data holds the real payload. + + This list may be modified at a number of points during the bootup + process. Therefore, when modifying this list one should always make + sure to consider the case where the linked list already contains + entries. + + +**** THE IMAGE CHECKSUM + +From boot protocol version 2.08 onwards the CRC-32 is calculated over +the entire file using the characteristic polynomial 0x04C11DB7 and an +initial remainder of 0xffffffff. The checksum is appended to the +file; therefore the CRC of the file up to the limit specified in the +syssize field of the header is always 0. + + +**** THE KERNEL COMMAND LINE + +The kernel command line has become an important way for the boot +loader to communicate with the kernel. Some of its options are also +relevant to the boot loader itself, see "special command line options" +below. + +The kernel command line is a null-terminated string. The maximum +length can be retrieved from the field cmdline_size. Before protocol +version 2.06, the maximum was 255 characters. A string that is too +long will be automatically truncated by the kernel. + +If the boot protocol version is 2.02 or later, the address of the +kernel command line is given by the header field cmd_line_ptr (see +above.) This address can be anywhere between the end of the setup +heap and 0xA0000. + +If the protocol version is *not* 2.02 or higher, the kernel +command line is entered using the following protocol: + + At offset 0x0020 (word), "cmd_line_magic", enter the magic + number 0xA33F. + + At offset 0x0022 (word), "cmd_line_offset", enter the offset + of the kernel command line (relative to the start of the + real-mode kernel). + + The kernel command line *must* be within the memory region + covered by setup_move_size, so you may need to adjust this + field. + + +**** MEMORY LAYOUT OF THE REAL-MODE CODE + +The real-mode code requires a stack/heap to be set up, as well as +memory allocated for the kernel command line. This needs to be done +in the real-mode accessible memory in bottom megabyte. + +It should be noted that modern machines often have a sizable Extended +BIOS Data Area (EBDA). As a result, it is advisable to use as little +of the low megabyte as possible. + +Unfortunately, under the following circumstances the 0x90000 memory +segment has to be used: + + - When loading a zImage kernel ((loadflags & 0x01) == 0). + - When loading a 2.01 or earlier boot protocol kernel. + + -> For the 2.00 and 2.01 boot protocols, the real-mode code + can be loaded at another address, but it is internally + relocated to 0x90000. For the "old" protocol, the + real-mode code must be loaded at 0x90000. + +When loading at 0x90000, avoid using memory above 0x9a000. + +For boot protocol 2.02 or higher, the command line does not have to be +located in the same 64K segment as the real-mode setup code; it is +thus permitted to give the stack/heap the full 64K segment and locate +the command line above it. + +The kernel command line should not be located below the real-mode +code, nor should it be located in high memory. + + +**** SAMPLE BOOT CONFIGURATION + +As a sample configuration, assume the following layout of the real +mode segment: + + When loading below 0x90000, use the entire segment: + + 0x0000-0x7fff Real mode kernel + 0x8000-0xdfff Stack and heap + 0xe000-0xffff Kernel command line + + When loading at 0x90000 OR the protocol version is 2.01 or earlier: + + 0x0000-0x7fff Real mode kernel + 0x8000-0x97ff Stack and heap + 0x9800-0x9fff Kernel command line + +Such a boot loader should enter the following fields in the header: + + unsigned long base_ptr; /* base address for real-mode segment */ + + if ( setup_sects == 0 ) { + setup_sects = 4; + } + + if ( protocol >= 0x0200 ) { + type_of_loader = <type code>; + if ( loading_initrd ) { + ramdisk_image = <initrd_address>; + ramdisk_size = <initrd_size>; + } + + if ( protocol >= 0x0202 && loadflags & 0x01 ) + heap_end = 0xe000; + else + heap_end = 0x9800; + + if ( protocol >= 0x0201 ) { + heap_end_ptr = heap_end - 0x200; + loadflags |= 0x80; /* CAN_USE_HEAP */ + } + + if ( protocol >= 0x0202 ) { + cmd_line_ptr = base_ptr + heap_end; + strcpy(cmd_line_ptr, cmdline); + } else { + cmd_line_magic = 0xA33F; + cmd_line_offset = heap_end; + setup_move_size = heap_end + strlen(cmdline)+1; + strcpy(base_ptr+cmd_line_offset, cmdline); + } + } else { + /* Very old kernel */ + + heap_end = 0x9800; + + cmd_line_magic = 0xA33F; + cmd_line_offset = heap_end; + + /* A very old kernel MUST have its real-mode code + loaded at 0x90000 */ + + if ( base_ptr != 0x90000 ) { + /* Copy the real-mode kernel */ + memcpy(0x90000, base_ptr, (setup_sects+1)*512); + base_ptr = 0x90000; /* Relocated */ + } + + strcpy(0x90000+cmd_line_offset, cmdline); + + /* It is recommended to clear memory up to the 32K mark */ + memset(0x90000 + (setup_sects+1)*512, 0, + (64-(setup_sects+1))*512); + } + + +**** LOADING THE REST OF THE KERNEL + +The 32-bit (non-real-mode) kernel starts at offset (setup_sects+1)*512 +in the kernel file (again, if setup_sects == 0 the real value is 4.) +It should be loaded at address 0x10000 for Image/zImage kernels and +0x100000 for bzImage kernels. + +The kernel is a bzImage kernel if the protocol >= 2.00 and the 0x01 +bit (LOAD_HIGH) in the loadflags field is set: + + is_bzImage = (protocol >= 0x0200) && (loadflags & 0x01); + load_address = is_bzImage ? 0x100000 : 0x10000; + +Note that Image/zImage kernels can be up to 512K in size, and thus use +the entire 0x10000-0x90000 range of memory. This means it is pretty +much a requirement for these kernels to load the real-mode part at +0x90000. bzImage kernels allow much more flexibility. + + +**** SPECIAL COMMAND LINE OPTIONS + +If the command line provided by the boot loader is entered by the +user, the user may expect the following command line options to work. +They should normally not be deleted from the kernel command line even +though not all of them are actually meaningful to the kernel. Boot +loader authors who need additional command line options for the boot +loader itself should get them registered in +Documentation/kernel-parameters.txt to make sure they will not +conflict with actual kernel options now or in the future. + + vga=<mode> + <mode> here is either an integer (in C notation, either + decimal, octal, or hexadecimal) or one of the strings + "normal" (meaning 0xFFFF), "ext" (meaning 0xFFFE) or "ask" + (meaning 0xFFFD). This value should be entered into the + vid_mode field, as it is used by the kernel before the command + line is parsed. + + mem=<size> + <size> is an integer in C notation optionally followed by + (case insensitive) K, M, G, T, P or E (meaning << 10, << 20, + << 30, << 40, << 50 or << 60). This specifies the end of + memory to the kernel. This affects the possible placement of + an initrd, since an initrd should be placed near end of + memory. Note that this is an option to *both* the kernel and + the bootloader! + + initrd=<file> + An initrd should be loaded. The meaning of <file> is + obviously bootloader-dependent, and some boot loaders + (e.g. LILO) do not have such a command. + +In addition, some boot loaders add the following options to the +user-specified command line: + + BOOT_IMAGE=<file> + The boot image which was loaded. Again, the meaning of <file> + is obviously bootloader-dependent. + + auto + The kernel was booted without explicit user intervention. + +If these options are added by the boot loader, it is highly +recommended that they are located *first*, before the user-specified +or configuration-specified command line. Otherwise, "init=/bin/sh" +gets confused by the "auto" option. + + +**** RUNNING THE KERNEL + +The kernel is started by jumping to the kernel entry point, which is +located at *segment* offset 0x20 from the start of the real mode +kernel. This means that if you loaded your real-mode kernel code at +0x90000, the kernel entry point is 9020:0000. + +At entry, ds = es = ss should point to the start of the real-mode +kernel code (0x9000 if the code is loaded at 0x90000), sp should be +set up properly, normally pointing to the top of the heap, and +interrupts should be disabled. Furthermore, to guard against bugs in +the kernel, it is recommended that the boot loader sets fs = gs = ds = +es = ss. + +In our example from above, we would do: + + /* Note: in the case of the "old" kernel protocol, base_ptr must + be == 0x90000 at this point; see the previous sample code */ + + seg = base_ptr >> 4; + + cli(); /* Enter with interrupts disabled! */ + + /* Set up the real-mode kernel stack */ + _SS = seg; + _SP = heap_end; + + _DS = _ES = _FS = _GS = seg; + jmp_far(seg+0x20, 0); /* Run the kernel */ + +If your boot sector accesses a floppy drive, it is recommended to +switch off the floppy motor before running the kernel, since the +kernel boot leaves interrupts off and thus the motor will not be +switched off, especially if the loaded kernel has the floppy driver as +a demand-loaded module! + + +**** ADVANCED BOOT LOADER HOOKS + +If the boot loader runs in a particularly hostile environment (such as +LOADLIN, which runs under DOS) it may be impossible to follow the +standard memory location requirements. Such a boot loader may use the +following hooks that, if set, are invoked by the kernel at the +appropriate time. The use of these hooks should probably be +considered an absolutely last resort! + +IMPORTANT: All the hooks are required to preserve %esp, %ebp, %esi and +%edi across invocation. + + realmode_swtch: + A 16-bit real mode far subroutine invoked immediately before + entering protected mode. The default routine disables NMI, so + your routine should probably do so, too. + + code32_start: + A 32-bit flat-mode routine *jumped* to immediately after the + transition to protected mode, but before the kernel is + uncompressed. No segments, except CS, are guaranteed to be + set up (current kernels do, but older ones do not); you should + set them up to BOOT_DS (0x18) yourself. + + After completing your hook, you should jump to the address + that was in this field before your boot loader overwrote it + (relocated, if appropriate.) + + +**** 32-bit BOOT PROTOCOL + +For machine with some new BIOS other than legacy BIOS, such as EFI, +LinuxBIOS, etc, and kexec, the 16-bit real mode setup code in kernel +based on legacy BIOS can not be used, so a 32-bit boot protocol needs +to be defined. + +In 32-bit boot protocol, the first step in loading a Linux kernel +should be to setup the boot parameters (struct boot_params, +traditionally known as "zero page"). The memory for struct boot_params +should be allocated and initialized to all zero. Then the setup header +from offset 0x01f1 of kernel image on should be loaded into struct +boot_params and examined. The end of setup header can be calculated as +follow: + + 0x0202 + byte value at offset 0x0201 + +In addition to read/modify/write the setup header of the struct +boot_params as that of 16-bit boot protocol, the boot loader should +also fill the additional fields of the struct boot_params as that +described in zero-page.txt. + +After setupping the struct boot_params, the boot loader can load the +32/64-bit kernel in the same way as that of 16-bit boot protocol. + +In 32-bit boot protocol, the kernel is started by jumping to the +32-bit kernel entry point, which is the start address of loaded +32/64-bit kernel. + +At entry, the CPU must be in 32-bit protected mode with paging +disabled; a GDT must be loaded with the descriptors for selectors +__BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be 4G flat +segment; __BOOS_CS must have execute/read permission, and __BOOT_DS +must have read/write permission; CS must be __BOOT_CS and DS, ES, SS +must be __BOOT_DS; interrupt must be disabled; %esi must hold the base +address of the struct boot_params; %ebp, %edi and %ebx must be zero. diff --git a/Documentation/x86/i386/usb-legacy-support.txt b/Documentation/x86/i386/usb-legacy-support.txt new file mode 100644 index 0000000..1894cdf --- /dev/null +++ b/Documentation/x86/i386/usb-legacy-support.txt @@ -0,0 +1,44 @@ +USB Legacy support +~~~~~~~~~~~~~~~~~~ + +Vojtech Pavlik <vojtech@suse.cz>, January 2004 + + +Also known as "USB Keyboard" or "USB Mouse support" in the BIOS Setup is a +feature that allows one to use the USB mouse and keyboard as if they were +their classic PS/2 counterparts. This means one can use an USB keyboard to +type in LILO for example. + +It has several drawbacks, though: + +1) On some machines, the emulated PS/2 mouse takes over even when no USB + mouse is present and a real PS/2 mouse is present. In that case the extra + features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may + not be available. + +2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause + system crashes, because the SMM BIOS is not expecting to be in PAE mode. + The Intel E7505 is a typical machine where this happens. + +3) If AMD64 64-bit mode is enabled, again system crashes often happen, + because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The + BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit + yet. + +Solutions: + +Problem 1) can be solved by loading the USB drivers prior to loading the +PS/2 mouse driver. Since the PS/2 mouse driver is in 2.6 compiled into +the kernel unconditionally, this means the USB drivers need to be +compiled-in, too. + +Problem 2) can currently only be solved by either disabling HIGHMEM64G +in the kernel config or USB Legacy support in the BIOS. A BIOS update +could help, but so far no such update exists. + +Problem 3) is usually fixed by a BIOS update. Check the board +manufacturers web site. If an update is not available, disable USB +Legacy support in the BIOS. If this alone doesn't help, try also adding +idle=poll on the kernel command line. The BIOS may be entering the SMM +on the HLT instruction as well. + diff --git a/Documentation/x86/i386/zero-page.txt b/Documentation/x86/i386/zero-page.txt new file mode 100644 index 0000000..169ad42 --- /dev/null +++ b/Documentation/x86/i386/zero-page.txt @@ -0,0 +1,31 @@ +The additional fields in struct boot_params as a part of 32-bit boot +protocol of kernel. These should be filled by bootloader or 16-bit +real-mode setup code of the kernel. References/settings to it mainly +are in: + + include/asm-x86/bootparam.h + + +Offset Proto Name Meaning +/Size + +000/040 ALL screen_info Text mode or frame buffer information + (struct screen_info) +040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) +060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information + (struct ist_info) +080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! +090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!! +0A0/010 ALL sys_desc_table System description table (struct sys_desc_table) +140/080 ALL edid_info Video mode setup (struct edid_info) +1C0/020 ALL efi_info EFI 32 information (struct efi_info) +1E0/004 ALL alk_mem_k Alternative mem check, in KB +1E4/004 ALL scratch Scratch field for the kernel setup code +1E8/001 ALL e820_entries Number of entries in e820_map (below) +1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below) +1EA/001 ALL edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer + (below) +290/040 ALL edd_mbr_sig_buffer EDD MBR signatures +2D0/A00 ALL e820_map E820 memory map table + (array of struct e820entry) +D00/1EC ALL eddbuf EDD data (array of struct edd_info) diff --git a/Documentation/x86/x86_64/00-INDEX b/Documentation/x86/x86_64/00-INDEX new file mode 100644 index 0000000..92fc20a --- /dev/null +++ b/Documentation/x86/x86_64/00-INDEX @@ -0,0 +1,16 @@ +00-INDEX + - This file +boot-options.txt + - AMD64-specific boot options. +cpu-hotplug-spec + - Firmware support for CPU hotplug under Linux/x86-64 +fake-numa-for-cpusets + - Using numa=fake and CPUSets for Resource Management +kernel-stacks + - Context-specific per-processor interrupt stacks. +machinecheck + - Configurable sysfs parameters for the x86-64 machine check code. +mm.txt + - Memory layout of x86-64 (4 level page tables, 46 bits physical). +uefi.txt + - Booting Linux via Unified Extensible Firmware Interface. diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt new file mode 100644 index 0000000..b0c7b6c --- /dev/null +++ b/Documentation/x86/x86_64/boot-options.txt @@ -0,0 +1,314 @@ +AMD64 specific boot options + +There are many others (usually documented in driver documentation), but +only the AMD64 specific ones are listed here. + +Machine check + + mce=off disable machine check + mce=bootlog Enable logging of machine checks left over from booting. + Disabled by default on AMD because some BIOS leave bogus ones. + If your BIOS doesn't do that it's a good idea to enable though + to make sure you log even machine check events that result + in a reboot. On Intel systems it is enabled by default. + mce=nobootlog + Disable boot machine check logging. + mce=tolerancelevel (number) + 0: always panic on uncorrected errors, log corrected errors + 1: panic or SIGBUS on uncorrected errors, log corrected errors + 2: SIGBUS or log uncorrected errors, log corrected errors + 3: never panic or SIGBUS, log all errors (for testing only) + Default is 1 + Can be also set using sysfs which is preferable. + + nomce (for compatibility with i386): same as mce=off + + Everything else is in sysfs now. + +APICs + + apic Use IO-APIC. Default + + noapic Don't use the IO-APIC. + + disableapic Don't use the local APIC + + nolapic Don't use the local APIC (alias for i386 compatibility) + + pirq=... See Documentation/i386/IO-APIC.txt + + noapictimer Don't set up the APIC timer + + no_timer_check Don't check the IO-APIC timer. This can work around + problems with incorrect timer initialization on some boards. + + apicmaintimer Run time keeping from the local APIC timer instead + of using the PIT/HPET interrupt for this. This is useful + when the PIT/HPET interrupts are unreliable. + + noapicmaintimer Don't do time keeping using the APIC timer. + Useful when this option was auto selected, but doesn't work. + + apicpmtimer + Do APIC timer calibration using the pmtimer. Implies + apicmaintimer. Useful when your PIT timer is totally + broken. + + disable_8254_timer / enable_8254_timer + Enable interrupt 0 timer routing over the 8254 in addition to over + the IO-APIC. The kernel tries to set a sensible default. + +Early Console + + syntax: earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + + The early console is useful when the kernel crashes before the + normal console is initialized. It is not enabled by + default because it has some cosmetic problems. + Append ,keep to not disable it when the real console takes over. + Only vga or serial at a time, not both. + Currently only ttyS0 and ttyS1 are supported. + Interaction with the standard serial driver is not very good. + The VGA output is eventually overwritten by the real console. + +Timing + + notsc + Don't use the CPU time stamp counter to read the wall time. + This can be used to work around timing problems on multiprocessor systems + with not properly synchronized CPUs. + + report_lost_ticks + Report when timer interrupts are lost because some code turned off + interrupts for too long. + + nmi_watchdog=NUMBER[,panic] + NUMBER can be: + 0 don't use an NMI watchdog + 1 use the IO-APIC timer for the NMI watchdog + 2 use the local APIC for the NMI watchdog using a performance counter. Note + This will use one performance counter and the local APIC's performance + vector. + When panic is specified panic when an NMI watchdog timeout occurs. + This is useful when you use a panic=... timeout and need the box + quickly up again. + + nohpet + Don't use the HPET timer. + +Idle loop + + idle=poll + Don't do power saving in the idle loop using HLT, but poll for rescheduling + event. This will make the CPUs eat a lot more power, but may be useful + to get slightly better performance in multiprocessor benchmarks. It also + makes some profiling using performance counters more accurate. + Please note that on systems with MONITOR/MWAIT support (like Intel EM64T + CPUs) this option has no performance advantage over the normal idle loop. + It may also interact badly with hyperthreading. + +Rebooting + + reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old] + bios Use the CPU reboot vector for warm reset + warm Don't set the cold reboot flag + cold Set the cold reboot flag + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + acpi Use the ACPI RESET_REG in the FADT. If ACPI is not configured or the + ACPI reset does not work, the reboot path attempts the reset using + the keyboard controller. + efi Use efi reset_system runtime service. If EFI is not configured or the + EFI reset does not work, the reboot path attempts the reset using + the keyboard controller. + + Using warm reset will be much faster especially on big memory + systems because the BIOS will not go through the memory check. + Disadvantage is that not all hardware will be completely reinitialized + on reboot so there may be boot problems on some systems. + + reboot=force + + Don't stop other CPUs on reboot. This can make reboot more reliable + in some cases. + +Non Executable Mappings + + noexec=on|off + + on Enable(default) + off Disable + +SMP + + additional_cpus=NUM Allow NUM more CPUs for hotplug + (defaults are specified by the BIOS, see Documentation/x86_64/cpu-hotplug-spec) + +NUMA + + numa=off Only set up a single NUMA node spanning all memory. + + numa=noacpi Don't parse the SRAT table for NUMA setup + + numa=fake=CMDLINE + If a number, fakes CMDLINE nodes and ignores NUMA setup of the + actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. For example: + numa=fake=2*512,1024,4*256,*128 + gives two 512M nodes, a 1024M node, four 256M nodes, and the + rest split into 128M chunks. If the last character of CMDLINE + is a *, the remaining memory is divided up equally among its + coefficient: + numa=fake=2*512,2* + gives two 512M nodes and the rest split into two nodes. + Otherwise, the remaining system RAM is allocated to an + additional node. + + numa=hotadd=percent + Only allow hotadd memory to preallocate page structures upto + percent of already available memory. + numa=hotadd=0 will disable hotadd memory. + +ACPI + + acpi=off Don't enable ACPI + acpi=ht Use ACPI boot table parsing, but don't enable ACPI + interpreter + acpi=force Force ACPI on (currently not needed) + + acpi=strict Disable out of spec ACPI workarounds. + + acpi_sci={edge,level,high,low} Set up ACPI SCI interrupt. + + acpi=noirq Don't route interrupts + +PCI + + pci=off Don't use PCI + pci=conf1 Use conf1 access. + pci=conf2 Use conf2 access. + pci=rom Assign ROMs. + pci=assign-busses Assign busses + pci=irqmask=MASK Set PCI interrupt mask to MASK + pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says. + pci=noacpi Don't use ACPI to set up PCI interrupt routing. + +IOMMU (input/output memory management unit) + + Currently four x86-64 PCI-DMA mapping implementations exist: + + 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all + (e.g. because you have < 3 GB memory). + Kernel boot message: "PCI-DMA: Disabling IOMMU" + + 2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU. + Kernel boot message: "PCI-DMA: using GART IOMMU" + + 3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used + e.g. if there is no hardware IOMMU in the system and it is need because + you have >3GB memory or told the kernel to us it (iommu=soft)) + Kernel boot message: "PCI-DMA: Using software bounce buffering + for IO (SWIOTLB)" + + 4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM + pSeries and xSeries servers. This hardware IOMMU supports DMA address + mapping with memory protection, etc. + Kernel boot message: "PCI-DMA: Using Calgary IOMMU" + + iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>] + [,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge] + [,noaperture][,calgary] + + General iommu options: + off Don't initialize and use any kind of IOMMU. + noforce Don't force hardware IOMMU usage when it is not needed. + (default). + force Force the use of the hardware IOMMU even when it is + not actually needed (e.g. because < 3 GB memory). + soft Use software bounce buffering (SWIOTLB) (default for + Intel machines). This can be used to prevent the usage + of an available hardware IOMMU. + + iommu options only relevant to the AMD GART hardware IOMMU: + <size> Set the size of the remapping area in bytes. + allowed Overwrite iommu off workarounds for specific chipsets. + fullflush Flush IOMMU on each allocation (default). + nofullflush Don't use IOMMU fullflush. + leak Turn on simple iommu leak tracing (only when + CONFIG_IOMMU_LEAK is on). Default number of leak pages + is 20. + memaper[=<order>] Allocate an own aperture over RAM with size 32MB<<order. + (default: order=1, i.e. 64MB) + merge Do scatter-gather (SG) merging. Implies "force" + (experimental). + nomerge Don't do scatter-gather (SG) merging. + noaperture Ask the IOMMU not to touch the aperture for AGP. + forcesac Force single-address cycle (SAC) mode for masks <40bits + (experimental). + noagp Don't initialize the AGP driver and use full aperture. + allowdac Allow double-address cycle (DAC) mode, i.e. DMA >4GB. + DAC is used with 32-bit PCI to push a 64-bit address in + two cycles. When off all DMA over >4GB is forced through + an IOMMU or software bounce buffering. + nodac Forbid DAC mode, i.e. DMA >4GB. + panic Always panic when IOMMU overflows. + calgary Use the Calgary IOMMU if it is available + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: + swiotlb=<pages>[,force] + <pages> Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. + + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: + + calgary=[64k,128k,256k,512k,1M,2M,4M,8M] + calgary=[translate_empty_slots] + calgary=[disable=<PCI bus number>] + panic Always panic when IOMMU overflows + + 64k,...,8M - Set the size of each PCI slot's translation table + when using the Calgary IOMMU. This is the size of the translation + table itself in main memory. The smallest table, 64k, covers an IO + space of 32MB; the largest, 8MB table, can cover an IO space of + 4GB. Normally the kernel will make the right choice by itself. + + translate_empty_slots - Enable translation even on slots that have + no devices attached to them, in case a device will be hotplugged + in the future. + + disable=<PCI bus number> - Disable translation on a given PHB. For + example, the built-in graphics adapter resides on the first bridge + (PCI bus number 0); if translation (isolation) is enabled on this + bridge, X servers that access the hardware directly from user + space might stop working. Use this option if you have devices that + are accessed from userspace directly on some PCI host bridge. + +Debugging + + oops=panic Always panic on oopses. Default is to just kill the process, + but there is a small probability of deadlocking the machine. + This will also cause panics on machine check exceptions. + Useful together with panic=30 to trigger a reboot. + + kstack=N Print N words from the kernel stack in oops dumps. + + pagefaulttrace Dump all page faults. Only useful for extreme debugging + and will create a lot of output. + + call_trace=[old|both|newfallback|new] + old: use old inexact backtracer + new: use new exact dwarf2 unwinder + both: print entries from both + newfallback: use new unwinder but fall back to old if it gets + stuck (default) + +Miscellaneous + + nogbpages + Do not use GB pages for kernel direct mappings. + gbpages + Use GB pages for kernel direct mappings. diff --git a/Documentation/x86/x86_64/cpu-hotplug-spec b/Documentation/x86/x86_64/cpu-hotplug-spec new file mode 100644 index 0000000..3c23e05 --- /dev/null +++ b/Documentation/x86/x86_64/cpu-hotplug-spec @@ -0,0 +1,21 @@ +Firmware support for CPU hotplug under Linux/x86-64 +--------------------------------------------------- + +Linux/x86-64 supports CPU hotplug now. For various reasons Linux wants to +know in advance of boot time the maximum number of CPUs that could be plugged +into the system. ACPI 3.0 currently has no official way to supply +this information from the firmware to the operating system. + +In ACPI each CPU needs an LAPIC object in the MADT table (5.2.11.5 in the +ACPI 3.0 specification). ACPI already has the concept of disabled LAPIC +objects by setting the Enabled bit in the LAPIC object to zero. + +For CPU hotplug Linux/x86-64 expects now that any possible future hotpluggable +CPU is already available in the MADT. If the CPU is not available yet +it should have its LAPIC Enabled bit set to 0. Linux will use the number +of disabled LAPICs to compute the maximum number of future CPUs. + +In the worst case the user can overwrite this choice using a command line +option (additional_cpus=...), but it is recommended to supply the correct +number (or a reasonable approximation of it, with erring towards more not less) +in the MADT to avoid manual configuration. diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets b/Documentation/x86/x86_64/fake-numa-for-cpusets new file mode 100644 index 0000000..d1a985c --- /dev/null +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets @@ -0,0 +1,66 @@ +Using numa=fake and CPUSets for Resource Management +Written by David Rientjes <rientjes@cs.washington.edu> + +This document describes how the numa=fake x86_64 command-line option can be used +in conjunction with cpusets for coarse memory management. Using this feature, +you can create fake NUMA nodes that represent contiguous chunks of memory and +assign them to cpusets and their attached tasks. This is a way of limiting the +amount of system memory that are available to a certain class of tasks. + +For more information on the features of cpusets, see Documentation/cpusets.txt. +There are a number of different configurations you can use for your needs. For +more information on the numa=fake command line option and its various ways of +configuring fake nodes, see Documentation/x86_64/boot-options.txt. + +For the purposes of this introduction, we'll assume a very primitive NUMA +emulation setup of "numa=fake=4*512,". This will split our system memory into +four equal chunks of 512M each that we can now use to assign to cpusets. As +you become more familiar with using this combination for resource control, +you'll determine a better setup to minimize the number of nodes you have to deal +with. + +A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: + + Faking node 0 at 0000000000000000-0000000020000000 (512MB) + Faking node 1 at 0000000020000000-0000000040000000 (512MB) + Faking node 2 at 0000000040000000-0000000060000000 (512MB) + Faking node 3 at 0000000060000000-0000000080000000 (512MB) + ... + On node 0 totalpages: 130975 + On node 1 totalpages: 131072 + On node 2 totalpages: 131072 + On node 3 totalpages: 131072 + +Now following the instructions for mounting the cpusets filesystem from +Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory +address spaces) to individual cpusets: + + [root@xroads /]# mkdir exampleset + [root@xroads /]# mount -t cpuset none exampleset + [root@xroads /]# mkdir exampleset/ddset + [root@xroads /]# cd exampleset/ddset + [root@xroads /exampleset/ddset]# echo 0-1 > cpus + [root@xroads /exampleset/ddset]# echo 0-1 > mems + +Now this cpuset, 'ddset', will only allowed access to fake nodes 0 and 1 for +memory allocations (1G). + +You can now assign tasks to these cpusets to limit the memory resources +available to them according to the fake nodes assigned as mems: + + [root@xroads /exampleset/ddset]# echo $$ > tasks + [root@xroads /exampleset/ddset]# dd if=/dev/zero of=tmp bs=1024 count=1G + [1] 13425 + +Notice the difference between the system memory usage as reported by +/proc/meminfo between the restricted cpuset case above and the unrestricted +case (i.e. running the same 'dd' command without assigning it to a fake NUMA +cpuset): + Unrestricted Restricted + MemTotal: 3091900 kB 3091900 kB + MemFree: 42113 kB 1513236 kB + +This allows for coarse memory management for the tasks you assign to particular +cpusets. Since cpusets can form a hierarchy, you can create some pretty +interesting combinations of use-cases for various classes of tasks for your +memory management needs. diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks new file mode 100644 index 0000000..5ad65d5 --- /dev/null +++ b/Documentation/x86/x86_64/kernel-stacks @@ -0,0 +1,99 @@ +Most of the text from Keith Owens, hacked by AK + +x86_64 page size (PAGE_SIZE) is 4K. + +Like all other architectures, x86_64 has a kernel stack for every +active thread. These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big. +These stacks contain useful data as long as a thread is alive or a +zombie. While the thread is in user space the kernel stack is empty +except for the thread_info structure at the bottom. + +In addition to the per thread stacks, there are specialized stacks +associated with each CPU. These stacks are only used while the kernel +is in control on that CPU; when a CPU returns to user space the +specialized stacks contain no useful data. The main CPU stacks are: + +* Interrupt stack. IRQSTACKSIZE + + Used for external hardware interrupts. If this is the first external + hardware interrupt (i.e. not a nested hardware interrupt) then the + kernel switches from the current task to the interrupt stack. Like + the split thread and interrupt stacks on i386 (with CONFIG_4KSTACKS), + this gives more room for kernel interrupt processing without having + to increase the size of every per thread stack. + + The interrupt stack is also used when processing a softirq. + +Switching to the kernel interrupt stack is done by software based on a +per CPU interrupt nest counter. This is needed because x86-64 "IST" +hardware stacks cannot nest without races. + +x86_64 also has a feature which is not available on i386, the ability +to automatically switch to a new stack for designated events such as +double fault or NMI, which makes it easier to handle these unusual +events on x86_64. This feature is called the Interrupt Stack Table +(IST). There can be up to 7 IST entries per CPU. The IST code is an +index into the Task State Segment (TSS). The IST entries in the TSS +point to dedicated stacks; each stack can be a different size. + +An IST is selected by a non-zero value in the IST field of an +interrupt-gate descriptor. When an interrupt occurs and the hardware +loads such a descriptor, the hardware automatically sets the new stack +pointer based on the IST value, then invokes the interrupt handler. If +software wants to allow nested IST interrupts then the handler must +adjust the IST values on entry to and exit from the interrupt handler. +(This is occasionally done, e.g. for debug exceptions.) + +Events with different IST codes (i.e. with different stacks) can be +nested. For example, a debug interrupt can safely be interrupted by an +NMI. arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack +pointers on entry to and exit from all IST events, in theory allowing +IST events with the same code to be nested. However in most cases, the +stack size allocated to an IST assumes no nesting for the same code. +If that assumption is ever broken then the stacks will become corrupt. + +The currently assigned IST stacks are :- + +* STACKFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 12 - Stack Fault Exception (#SS). + + This allows the CPU to recover from invalid stack segments. Rarely + happens. + +* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 8 - Double Fault Exception (#DF). + + Invoked when handling one exception causes another exception. Happens + when the kernel is very confused (e.g. kernel stack pointer corrupt). + Using a separate stack allows the kernel to recover from it well enough + in many cases to still output an oops. + +* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for non-maskable interrupts (NMI). + + NMI can be delivered at any time, including when the kernel is in the + middle of switching stacks. Using IST for NMI events avoids making + assumptions about the previous state of the kernel stack. + +* DEBUG_STACK. DEBUG_STKSZ + + Used for hardware debug interrupts (interrupt 1) and for software + debug interrupts (INT3). + + When debugging a kernel, debug interrupts (both hardware and + software) can occur at any time. Using IST for these interrupts + avoids making assumptions about the previous state of the kernel + stack. + +* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). + + Used for interrupt 18 - Machine Check Exception (#MC). + + MCE can be delivered at any time, including when the kernel is in the + middle of switching stacks. Using IST for MCE events avoids making + assumptions about the previous state of the kernel stack. + +For more details see the Intel IA32 or AMD AMD64 architecture manuals. diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck new file mode 100644 index 0000000..a05e58e --- /dev/null +++ b/Documentation/x86/x86_64/machinecheck @@ -0,0 +1,77 @@ + +Configurable sysfs parameters for the x86-64 machine check code. + +Machine checks report internal hardware error conditions detected +by the CPU. Uncorrected errors typically cause a machine check +(often with panic), corrected ones cause a machine check log entry. + +Machine checks are organized in banks (normally associated with +a hardware subsystem) and subevents in a bank. The exact meaning +of the banks and subevent is CPU specific. + +mcelog knows how to decode them. + +When you see the "Machine check errors logged" message in the system +log then mcelog should run to collect and decode machine check entries +from /dev/mcelog. Normally mcelog should be run regularly from a cronjob. + +Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN +(N = CPU number) + +The directory contains some configurable entries: + +Entries: + +bankNctl +(N bank number) + 64bit Hex bitmask enabling/disabling specific subevents for bank N + When a bit in the bitmask is zero then the respective + subevent will not be reported. + By default all events are enabled. + Note that BIOS maintain another mask to disable specific events + per bank. This is not visible here + +The following entries appear for each CPU, but they are truly shared +between all CPUs. + +check_interval + How often to poll for corrected machine check errors, in seconds + (Note output is hexademical). Default 5 minutes. When the poller + finds MCEs it triggers an exponential speedup (poll more often) on + the polling interval. When the poller stops finding MCEs, it + triggers an exponential backoff (poll less often) on the polling + interval. The check_interval variable is both the initial and + maximum polling interval. + +tolerant + Tolerance level. When a machine check exception occurs for a non + corrected machine check the kernel can take different actions. + Since machine check exceptions can happen any time it is sometimes + risky for the kernel to kill a process because it defies + normal kernel locking rules. The tolerance level configures + how hard the kernel tries to recover even at some risk of + deadlock. Higher tolerant values trade potentially better uptime + with the risk of a crash or even corruption (for tolerant >= 3). + + 0: always panic on uncorrected errors, log corrected errors + 1: panic or SIGBUS on uncorrected errors, log corrected errors + 2: SIGBUS or log uncorrected errors, log corrected errors + 3: never panic or SIGBUS, log all errors (for testing only) + + Default: 1 + + Note this only makes a difference if the CPU allows recovery + from a machine check exception. Current x86 CPUs generally do not. + +trigger + Program to run when a machine check event is detected. + This is an alternative to running mcelog regularly from cron + and allows to detect events faster. + +TBD document entries for AMD threshold interrupt configuration + +For more details about the x86 machine check architecture +see the Intel and AMD architecture manuals from their developer websites. + +For more details about the architecture see +see http://one.firstfloor.org/~andi/mce.pdf diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt new file mode 100644 index 0000000..b89b6d2 --- /dev/null +++ b/Documentation/x86/x86_64/mm.txt @@ -0,0 +1,29 @@ + +<previous description obsolete, deleted> + +Virtual memory map with 4 level page tables: + +0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm +hole caused by [48:63] sign extension +ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole +ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory +ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole +ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space +ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) +... unused hole ... +ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0 +... unused hole ... +ffffffff88000000 - fffffffffff00000 (=1919 MB) module mapping space + +The direct mapping covers all memory in the system up to the highest +memory address (this means in some cases it can also include PCI memory +holes). + +vmalloc space is lazily synchronized into the different PML4 pages of +the processes using the page fault handler, with init_level4_pgt as +reference. + +Current X86-64 implementations only support 40 bits of address space, +but we support up to 46 bits. This expands into MBZ space in the page tables. + +-Andi Kleen, Jul 2004 diff --git a/Documentation/x86/x86_64/uefi.txt b/Documentation/x86/x86_64/uefi.txt new file mode 100644 index 0000000..7d77120 --- /dev/null +++ b/Documentation/x86/x86_64/uefi.txt @@ -0,0 +1,38 @@ +General note on [U]EFI x86_64 support +------------------------------------- + +The nomenclature EFI and UEFI are used interchangeably in this document. + +Although the tools below are _not_ needed for building the kernel, +the needed bootloader support and associated tools for x86_64 platforms +with EFI firmware and specifications are listed below. + +1. UEFI specification: http://www.uefi.org + +2. Booting Linux kernel on UEFI x86_64 platform requires bootloader + support. Elilo with x86_64 support can be used. + +3. x86_64 platform with EFI/UEFI firmware. + +Mechanics: +--------- +- Build the kernel with the following configuration. + CONFIG_FB_EFI=y + CONFIG_FRAMEBUFFER_CONSOLE=y + If EFI runtime services are expected, the following configuration should + be selected. + CONFIG_EFI=y + CONFIG_EFI_VARS=y or m # optional +- Create a VFAT partition on the disk +- Copy the following to the VFAT partition: + elilo bootloader with x86_64 support, elilo configuration file, + kernel image built in first step and corresponding + initrd. Instructions on building elilo and its dependencies + can be found in the elilo sourceforge project. +- Boot to EFI shell and invoke elilo choosing the kernel image built + in first step. +- If some or all EFI runtime services don't work, you can try following + kernel command line parameters to turn off some or all EFI runtime + services. + noefi turn off all EFI runtime services + reboot_type=k turn off EFI reboot runtime service diff --git a/Documentation/x86_64/00-INDEX b/Documentation/x86_64/00-INDEX deleted file mode 100644 index 92fc20a..0000000 --- a/Documentation/x86_64/00-INDEX +++ /dev/null @@ -1,16 +0,0 @@ -00-INDEX - - This file -boot-options.txt - - AMD64-specific boot options. -cpu-hotplug-spec - - Firmware support for CPU hotplug under Linux/x86-64 -fake-numa-for-cpusets - - Using numa=fake and CPUSets for Resource Management -kernel-stacks - - Context-specific per-processor interrupt stacks. -machinecheck - - Configurable sysfs parameters for the x86-64 machine check code. -mm.txt - - Memory layout of x86-64 (4 level page tables, 46 bits physical). -uefi.txt - - Booting Linux via Unified Extensible Firmware Interface. diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt deleted file mode 100644 index b0c7b6c..0000000 --- a/Documentation/x86_64/boot-options.txt +++ /dev/null @@ -1,314 +0,0 @@ -AMD64 specific boot options - -There are many others (usually documented in driver documentation), but -only the AMD64 specific ones are listed here. - -Machine check - - mce=off disable machine check - mce=bootlog Enable logging of machine checks left over from booting. - Disabled by default on AMD because some BIOS leave bogus ones. - If your BIOS doesn't do that it's a good idea to enable though - to make sure you log even machine check events that result - in a reboot. On Intel systems it is enabled by default. - mce=nobootlog - Disable boot machine check logging. - mce=tolerancelevel (number) - 0: always panic on uncorrected errors, log corrected errors - 1: panic or SIGBUS on uncorrected errors, log corrected errors - 2: SIGBUS or log uncorrected errors, log corrected errors - 3: never panic or SIGBUS, log all errors (for testing only) - Default is 1 - Can be also set using sysfs which is preferable. - - nomce (for compatibility with i386): same as mce=off - - Everything else is in sysfs now. - -APICs - - apic Use IO-APIC. Default - - noapic Don't use the IO-APIC. - - disableapic Don't use the local APIC - - nolapic Don't use the local APIC (alias for i386 compatibility) - - pirq=... See Documentation/i386/IO-APIC.txt - - noapictimer Don't set up the APIC timer - - no_timer_check Don't check the IO-APIC timer. This can work around - problems with incorrect timer initialization on some boards. - - apicmaintimer Run time keeping from the local APIC timer instead - of using the PIT/HPET interrupt for this. This is useful - when the PIT/HPET interrupts are unreliable. - - noapicmaintimer Don't do time keeping using the APIC timer. - Useful when this option was auto selected, but doesn't work. - - apicpmtimer - Do APIC timer calibration using the pmtimer. Implies - apicmaintimer. Useful when your PIT timer is totally - broken. - - disable_8254_timer / enable_8254_timer - Enable interrupt 0 timer routing over the 8254 in addition to over - the IO-APIC. The kernel tries to set a sensible default. - -Early Console - - syntax: earlyprintk=vga - earlyprintk=serial[,ttySn[,baudrate]] - - The early console is useful when the kernel crashes before the - normal console is initialized. It is not enabled by - default because it has some cosmetic problems. - Append ,keep to not disable it when the real console takes over. - Only vga or serial at a time, not both. - Currently only ttyS0 and ttyS1 are supported. - Interaction with the standard serial driver is not very good. - The VGA output is eventually overwritten by the real console. - -Timing - - notsc - Don't use the CPU time stamp counter to read the wall time. - This can be used to work around timing problems on multiprocessor systems - with not properly synchronized CPUs. - - report_lost_ticks - Report when timer interrupts are lost because some code turned off - interrupts for too long. - - nmi_watchdog=NUMBER[,panic] - NUMBER can be: - 0 don't use an NMI watchdog - 1 use the IO-APIC timer for the NMI watchdog - 2 use the local APIC for the NMI watchdog using a performance counter. Note - This will use one performance counter and the local APIC's performance - vector. - When panic is specified panic when an NMI watchdog timeout occurs. - This is useful when you use a panic=... timeout and need the box - quickly up again. - - nohpet - Don't use the HPET timer. - -Idle loop - - idle=poll - Don't do power saving in the idle loop using HLT, but poll for rescheduling - event. This will make the CPUs eat a lot more power, but may be useful - to get slightly better performance in multiprocessor benchmarks. It also - makes some profiling using performance counters more accurate. - Please note that on systems with MONITOR/MWAIT support (like Intel EM64T - CPUs) this option has no performance advantage over the normal idle loop. - It may also interact badly with hyperthreading. - -Rebooting - - reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old] - bios Use the CPU reboot vector for warm reset - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - acpi Use the ACPI RESET_REG in the FADT. If ACPI is not configured or the - ACPI reset does not work, the reboot path attempts the reset using - the keyboard controller. - efi Use efi reset_system runtime service. If EFI is not configured or the - EFI reset does not work, the reboot path attempts the reset using - the keyboard controller. - - Using warm reset will be much faster especially on big memory - systems because the BIOS will not go through the memory check. - Disadvantage is that not all hardware will be completely reinitialized - on reboot so there may be boot problems on some systems. - - reboot=force - - Don't stop other CPUs on reboot. This can make reboot more reliable - in some cases. - -Non Executable Mappings - - noexec=on|off - - on Enable(default) - off Disable - -SMP - - additional_cpus=NUM Allow NUM more CPUs for hotplug - (defaults are specified by the BIOS, see Documentation/x86_64/cpu-hotplug-spec) - -NUMA - - numa=off Only set up a single NUMA node spanning all memory. - - numa=noacpi Don't parse the SRAT table for NUMA setup - - numa=fake=CMDLINE - If a number, fakes CMDLINE nodes and ignores NUMA setup of the - actual machine. Otherwise, system memory is configured - depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256,*128 - gives two 512M nodes, a 1024M node, four 256M nodes, and the - rest split into 128M chunks. If the last character of CMDLINE - is a *, the remaining memory is divided up equally among its - coefficient: - numa=fake=2*512,2* - gives two 512M nodes and the rest split into two nodes. - Otherwise, the remaining system RAM is allocated to an - additional node. - - numa=hotadd=percent - Only allow hotadd memory to preallocate page structures upto - percent of already available memory. - numa=hotadd=0 will disable hotadd memory. - -ACPI - - acpi=off Don't enable ACPI - acpi=ht Use ACPI boot table parsing, but don't enable ACPI - interpreter - acpi=force Force ACPI on (currently not needed) - - acpi=strict Disable out of spec ACPI workarounds. - - acpi_sci={edge,level,high,low} Set up ACPI SCI interrupt. - - acpi=noirq Don't route interrupts - -PCI - - pci=off Don't use PCI - pci=conf1 Use conf1 access. - pci=conf2 Use conf2 access. - pci=rom Assign ROMs. - pci=assign-busses Assign busses - pci=irqmask=MASK Set PCI interrupt mask to MASK - pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says. - pci=noacpi Don't use ACPI to set up PCI interrupt routing. - -IOMMU (input/output memory management unit) - - Currently four x86-64 PCI-DMA mapping implementations exist: - - 1. <arch/x86_64/kernel/pci-nommu.c>: use no hardware/software IOMMU at all - (e.g. because you have < 3 GB memory). - Kernel boot message: "PCI-DMA: Disabling IOMMU" - - 2. <arch/x86_64/kernel/pci-gart.c>: AMD GART based hardware IOMMU. - Kernel boot message: "PCI-DMA: using GART IOMMU" - - 3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used - e.g. if there is no hardware IOMMU in the system and it is need because - you have >3GB memory or told the kernel to us it (iommu=soft)) - Kernel boot message: "PCI-DMA: Using software bounce buffering - for IO (SWIOTLB)" - - 4. <arch/x86_64/pci-calgary.c> : IBM Calgary hardware IOMMU. Used in IBM - pSeries and xSeries servers. This hardware IOMMU supports DMA address - mapping with memory protection, etc. - Kernel boot message: "PCI-DMA: Using Calgary IOMMU" - - iommu=[<size>][,noagp][,off][,force][,noforce][,leak[=<nr_of_leak_pages>] - [,memaper[=<order>]][,merge][,forcesac][,fullflush][,nomerge] - [,noaperture][,calgary] - - General iommu options: - off Don't initialize and use any kind of IOMMU. - noforce Don't force hardware IOMMU usage when it is not needed. - (default). - force Force the use of the hardware IOMMU even when it is - not actually needed (e.g. because < 3 GB memory). - soft Use software bounce buffering (SWIOTLB) (default for - Intel machines). This can be used to prevent the usage - of an available hardware IOMMU. - - iommu options only relevant to the AMD GART hardware IOMMU: - <size> Set the size of the remapping area in bytes. - allowed Overwrite iommu off workarounds for specific chipsets. - fullflush Flush IOMMU on each allocation (default). - nofullflush Don't use IOMMU fullflush. - leak Turn on simple iommu leak tracing (only when - CONFIG_IOMMU_LEAK is on). Default number of leak pages - is 20. - memaper[=<order>] Allocate an own aperture over RAM with size 32MB<<order. - (default: order=1, i.e. 64MB) - merge Do scatter-gather (SG) merging. Implies "force" - (experimental). - nomerge Don't do scatter-gather (SG) merging. - noaperture Ask the IOMMU not to touch the aperture for AGP. - forcesac Force single-address cycle (SAC) mode for masks <40bits - (experimental). - noagp Don't initialize the AGP driver and use full aperture. - allowdac Allow double-address cycle (DAC) mode, i.e. DMA >4GB. - DAC is used with 32-bit PCI to push a 64-bit address in - two cycles. When off all DMA over >4GB is forced through - an IOMMU or software bounce buffering. - nodac Forbid DAC mode, i.e. DMA >4GB. - panic Always panic when IOMMU overflows. - calgary Use the Calgary IOMMU if it is available - - iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU - implementation: - swiotlb=<pages>[,force] - <pages> Prereserve that many 128K pages for the software IO - bounce buffering. - force Force all IO through the software TLB. - - Settings for the IBM Calgary hardware IOMMU currently found in IBM - pSeries and xSeries machines: - - calgary=[64k,128k,256k,512k,1M,2M,4M,8M] - calgary=[translate_empty_slots] - calgary=[disable=<PCI bus number>] - panic Always panic when IOMMU overflows - - 64k,...,8M - Set the size of each PCI slot's translation table - when using the Calgary IOMMU. This is the size of the translation - table itself in main memory. The smallest table, 64k, covers an IO - space of 32MB; the largest, 8MB table, can cover an IO space of - 4GB. Normally the kernel will make the right choice by itself. - - translate_empty_slots - Enable translation even on slots that have - no devices attached to them, in case a device will be hotplugged - in the future. - - disable=<PCI bus number> - Disable translation on a given PHB. For - example, the built-in graphics adapter resides on the first bridge - (PCI bus number 0); if translation (isolation) is enabled on this - bridge, X servers that access the hardware directly from user - space might stop working. Use this option if you have devices that - are accessed from userspace directly on some PCI host bridge. - -Debugging - - oops=panic Always panic on oopses. Default is to just kill the process, - but there is a small probability of deadlocking the machine. - This will also cause panics on machine check exceptions. - Useful together with panic=30 to trigger a reboot. - - kstack=N Print N words from the kernel stack in oops dumps. - - pagefaulttrace Dump all page faults. Only useful for extreme debugging - and will create a lot of output. - - call_trace=[old|both|newfallback|new] - old: use old inexact backtracer - new: use new exact dwarf2 unwinder - both: print entries from both - newfallback: use new unwinder but fall back to old if it gets - stuck (default) - -Miscellaneous - - nogbpages - Do not use GB pages for kernel direct mappings. - gbpages - Use GB pages for kernel direct mappings. diff --git a/Documentation/x86_64/cpu-hotplug-spec b/Documentation/x86_64/cpu-hotplug-spec deleted file mode 100644 index 3c23e05..0000000 --- a/Documentation/x86_64/cpu-hotplug-spec +++ /dev/null @@ -1,21 +0,0 @@ -Firmware support for CPU hotplug under Linux/x86-64 ---------------------------------------------------- - -Linux/x86-64 supports CPU hotplug now. For various reasons Linux wants to -know in advance of boot time the maximum number of CPUs that could be plugged -into the system. ACPI 3.0 currently has no official way to supply -this information from the firmware to the operating system. - -In ACPI each CPU needs an LAPIC object in the MADT table (5.2.11.5 in the -ACPI 3.0 specification). ACPI already has the concept of disabled LAPIC -objects by setting the Enabled bit in the LAPIC object to zero. - -For CPU hotplug Linux/x86-64 expects now that any possible future hotpluggable -CPU is already available in the MADT. If the CPU is not available yet -it should have its LAPIC Enabled bit set to 0. Linux will use the number -of disabled LAPICs to compute the maximum number of future CPUs. - -In the worst case the user can overwrite this choice using a command line -option (additional_cpus=...), but it is recommended to supply the correct -number (or a reasonable approximation of it, with erring towards more not less) -in the MADT to avoid manual configuration. diff --git a/Documentation/x86_64/fake-numa-for-cpusets b/Documentation/x86_64/fake-numa-for-cpusets deleted file mode 100644 index d1a985c..0000000 --- a/Documentation/x86_64/fake-numa-for-cpusets +++ /dev/null @@ -1,66 +0,0 @@ -Using numa=fake and CPUSets for Resource Management -Written by David Rientjes <rientjes@cs.washington.edu> - -This document describes how the numa=fake x86_64 command-line option can be used -in conjunction with cpusets for coarse memory management. Using this feature, -you can create fake NUMA nodes that represent contiguous chunks of memory and -assign them to cpusets and their attached tasks. This is a way of limiting the -amount of system memory that are available to a certain class of tasks. - -For more information on the features of cpusets, see Documentation/cpusets.txt. -There are a number of different configurations you can use for your needs. For -more information on the numa=fake command line option and its various ways of -configuring fake nodes, see Documentation/x86_64/boot-options.txt. - -For the purposes of this introduction, we'll assume a very primitive NUMA -emulation setup of "numa=fake=4*512,". This will split our system memory into -four equal chunks of 512M each that we can now use to assign to cpusets. As -you become more familiar with using this combination for resource control, -you'll determine a better setup to minimize the number of nodes you have to deal -with. - -A machine may be split as follows with "numa=fake=4*512," as reported by dmesg: - - Faking node 0 at 0000000000000000-0000000020000000 (512MB) - Faking node 1 at 0000000020000000-0000000040000000 (512MB) - Faking node 2 at 0000000040000000-0000000060000000 (512MB) - Faking node 3 at 0000000060000000-0000000080000000 (512MB) - ... - On node 0 totalpages: 130975 - On node 1 totalpages: 131072 - On node 2 totalpages: 131072 - On node 3 totalpages: 131072 - -Now following the instructions for mounting the cpusets filesystem from -Documentation/cpusets.txt, you can assign fake nodes (i.e. contiguous memory -address spaces) to individual cpusets: - - [root@xroads /]# mkdir exampleset - [root@xroads /]# mount -t cpuset none exampleset - [root@xroads /]# mkdir exampleset/ddset - [root@xroads /]# cd exampleset/ddset - [root@xroads /exampleset/ddset]# echo 0-1 > cpus - [root@xroads /exampleset/ddset]# echo 0-1 > mems - -Now this cpuset, 'ddset', will only allowed access to fake nodes 0 and 1 for -memory allocations (1G). - -You can now assign tasks to these cpusets to limit the memory resources -available to them according to the fake nodes assigned as mems: - - [root@xroads /exampleset/ddset]# echo $$ > tasks - [root@xroads /exampleset/ddset]# dd if=/dev/zero of=tmp bs=1024 count=1G - [1] 13425 - -Notice the difference between the system memory usage as reported by -/proc/meminfo between the restricted cpuset case above and the unrestricted -case (i.e. running the same 'dd' command without assigning it to a fake NUMA -cpuset): - Unrestricted Restricted - MemTotal: 3091900 kB 3091900 kB - MemFree: 42113 kB 1513236 kB - -This allows for coarse memory management for the tasks you assign to particular -cpusets. Since cpusets can form a hierarchy, you can create some pretty -interesting combinations of use-cases for various classes of tasks for your -memory management needs. diff --git a/Documentation/x86_64/kernel-stacks b/Documentation/x86_64/kernel-stacks deleted file mode 100644 index 5ad65d5..0000000 --- a/Documentation/x86_64/kernel-stacks +++ /dev/null @@ -1,99 +0,0 @@ -Most of the text from Keith Owens, hacked by AK - -x86_64 page size (PAGE_SIZE) is 4K. - -Like all other architectures, x86_64 has a kernel stack for every -active thread. These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big. -These stacks contain useful data as long as a thread is alive or a -zombie. While the thread is in user space the kernel stack is empty -except for the thread_info structure at the bottom. - -In addition to the per thread stacks, there are specialized stacks -associated with each CPU. These stacks are only used while the kernel -is in control on that CPU; when a CPU returns to user space the -specialized stacks contain no useful data. The main CPU stacks are: - -* Interrupt stack. IRQSTACKSIZE - - Used for external hardware interrupts. If this is the first external - hardware interrupt (i.e. not a nested hardware interrupt) then the - kernel switches from the current task to the interrupt stack. Like - the split thread and interrupt stacks on i386 (with CONFIG_4KSTACKS), - this gives more room for kernel interrupt processing without having - to increase the size of every per thread stack. - - The interrupt stack is also used when processing a softirq. - -Switching to the kernel interrupt stack is done by software based on a -per CPU interrupt nest counter. This is needed because x86-64 "IST" -hardware stacks cannot nest without races. - -x86_64 also has a feature which is not available on i386, the ability -to automatically switch to a new stack for designated events such as -double fault or NMI, which makes it easier to handle these unusual -events on x86_64. This feature is called the Interrupt Stack Table -(IST). There can be up to 7 IST entries per CPU. The IST code is an -index into the Task State Segment (TSS). The IST entries in the TSS -point to dedicated stacks; each stack can be a different size. - -An IST is selected by a non-zero value in the IST field of an -interrupt-gate descriptor. When an interrupt occurs and the hardware -loads such a descriptor, the hardware automatically sets the new stack -pointer based on the IST value, then invokes the interrupt handler. If -software wants to allow nested IST interrupts then the handler must -adjust the IST values on entry to and exit from the interrupt handler. -(This is occasionally done, e.g. for debug exceptions.) - -Events with different IST codes (i.e. with different stacks) can be -nested. For example, a debug interrupt can safely be interrupted by an -NMI. arch/x86_64/kernel/entry.S::paranoidentry adjusts the stack -pointers on entry to and exit from all IST events, in theory allowing -IST events with the same code to be nested. However in most cases, the -stack size allocated to an IST assumes no nesting for the same code. -If that assumption is ever broken then the stacks will become corrupt. - -The currently assigned IST stacks are :- - -* STACKFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). - - Used for interrupt 12 - Stack Fault Exception (#SS). - - This allows the CPU to recover from invalid stack segments. Rarely - happens. - -* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). - - Used for interrupt 8 - Double Fault Exception (#DF). - - Invoked when handling one exception causes another exception. Happens - when the kernel is very confused (e.g. kernel stack pointer corrupt). - Using a separate stack allows the kernel to recover from it well enough - in many cases to still output an oops. - -* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). - - Used for non-maskable interrupts (NMI). - - NMI can be delivered at any time, including when the kernel is in the - middle of switching stacks. Using IST for NMI events avoids making - assumptions about the previous state of the kernel stack. - -* DEBUG_STACK. DEBUG_STKSZ - - Used for hardware debug interrupts (interrupt 1) and for software - debug interrupts (INT3). - - When debugging a kernel, debug interrupts (both hardware and - software) can occur at any time. Using IST for these interrupts - avoids making assumptions about the previous state of the kernel - stack. - -* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). - - Used for interrupt 18 - Machine Check Exception (#MC). - - MCE can be delivered at any time, including when the kernel is in the - middle of switching stacks. Using IST for MCE events avoids making - assumptions about the previous state of the kernel stack. - -For more details see the Intel IA32 or AMD AMD64 architecture manuals. diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck deleted file mode 100644 index a05e58e..0000000 --- a/Documentation/x86_64/machinecheck +++ /dev/null @@ -1,77 +0,0 @@ - -Configurable sysfs parameters for the x86-64 machine check code. - -Machine checks report internal hardware error conditions detected -by the CPU. Uncorrected errors typically cause a machine check -(often with panic), corrected ones cause a machine check log entry. - -Machine checks are organized in banks (normally associated with -a hardware subsystem) and subevents in a bank. The exact meaning -of the banks and subevent is CPU specific. - -mcelog knows how to decode them. - -When you see the "Machine check errors logged" message in the system -log then mcelog should run to collect and decode machine check entries -from /dev/mcelog. Normally mcelog should be run regularly from a cronjob. - -Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN -(N = CPU number) - -The directory contains some configurable entries: - -Entries: - -bankNctl -(N bank number) - 64bit Hex bitmask enabling/disabling specific subevents for bank N - When a bit in the bitmask is zero then the respective - subevent will not be reported. - By default all events are enabled. - Note that BIOS maintain another mask to disable specific events - per bank. This is not visible here - -The following entries appear for each CPU, but they are truly shared -between all CPUs. - -check_interval - How often to poll for corrected machine check errors, in seconds - (Note output is hexademical). Default 5 minutes. When the poller - finds MCEs it triggers an exponential speedup (poll more often) on - the polling interval. When the poller stops finding MCEs, it - triggers an exponential backoff (poll less often) on the polling - interval. The check_interval variable is both the initial and - maximum polling interval. - -tolerant - Tolerance level. When a machine check exception occurs for a non - corrected machine check the kernel can take different actions. - Since machine check exceptions can happen any time it is sometimes - risky for the kernel to kill a process because it defies - normal kernel locking rules. The tolerance level configures - how hard the kernel tries to recover even at some risk of - deadlock. Higher tolerant values trade potentially better uptime - with the risk of a crash or even corruption (for tolerant >= 3). - - 0: always panic on uncorrected errors, log corrected errors - 1: panic or SIGBUS on uncorrected errors, log corrected errors - 2: SIGBUS or log uncorrected errors, log corrected errors - 3: never panic or SIGBUS, log all errors (for testing only) - - Default: 1 - - Note this only makes a difference if the CPU allows recovery - from a machine check exception. Current x86 CPUs generally do not. - -trigger - Program to run when a machine check event is detected. - This is an alternative to running mcelog regularly from cron - and allows to detect events faster. - -TBD document entries for AMD threshold interrupt configuration - -For more details about the x86 machine check architecture -see the Intel and AMD architecture manuals from their developer websites. - -For more details about the architecture see -see http://one.firstfloor.org/~andi/mce.pdf diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt deleted file mode 100644 index b89b6d2..0000000 --- a/Documentation/x86_64/mm.txt +++ /dev/null @@ -1,29 +0,0 @@ - -<previous description obsolete, deleted> - -Virtual memory map with 4 level page tables: - -0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm -hole caused by [48:63] sign extension -ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory -ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole -ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space -ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) -... unused hole ... -ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0 -... unused hole ... -ffffffff88000000 - fffffffffff00000 (=1919 MB) module mapping space - -The direct mapping covers all memory in the system up to the highest -memory address (this means in some cases it can also include PCI memory -holes). - -vmalloc space is lazily synchronized into the different PML4 pages of -the processes using the page fault handler, with init_level4_pgt as -reference. - -Current X86-64 implementations only support 40 bits of address space, -but we support up to 46 bits. This expands into MBZ space in the page tables. - --Andi Kleen, Jul 2004 diff --git a/Documentation/x86_64/uefi.txt b/Documentation/x86_64/uefi.txt deleted file mode 100644 index 7d77120..0000000 --- a/Documentation/x86_64/uefi.txt +++ /dev/null @@ -1,38 +0,0 @@ -General note on [U]EFI x86_64 support -------------------------------------- - -The nomenclature EFI and UEFI are used interchangeably in this document. - -Although the tools below are _not_ needed for building the kernel, -the needed bootloader support and associated tools for x86_64 platforms -with EFI firmware and specifications are listed below. - -1. UEFI specification: http://www.uefi.org - -2. Booting Linux kernel on UEFI x86_64 platform requires bootloader - support. Elilo with x86_64 support can be used. - -3. x86_64 platform with EFI/UEFI firmware. - -Mechanics: ---------- -- Build the kernel with the following configuration. - CONFIG_FB_EFI=y - CONFIG_FRAMEBUFFER_CONSOLE=y - If EFI runtime services are expected, the following configuration should - be selected. - CONFIG_EFI=y - CONFIG_EFI_VARS=y or m # optional -- Create a VFAT partition on the disk -- Copy the following to the VFAT partition: - elilo bootloader with x86_64 support, elilo configuration file, - kernel image built in first step and corresponding - initrd. Instructions on building elilo and its dependencies - can be found in the elilo sourceforge project. -- Boot to EFI shell and invoke elilo choosing the kernel image built - in first step. -- If some or all EFI runtime services don't work, you can try following - kernel command line parameters to turn off some or all EFI runtime - services. - noefi turn off all EFI runtime services - reboot_type=k turn off EFI reboot runtime service -- cgit v1.1 From f0d43100f13be0fa5bf52741d7084bb27f00e621 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 29 May 2008 12:56:36 -0700 Subject: x86: extend e820 early_res support 32bit -fix #3 introduce init_pg_table_start, so xen PV could specify the value. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head32.c | 3 ++- arch/x86/kernel/head_32.S | 2 ++ arch/x86/kernel/setup_32.c | 7 +++++++ arch/x86/lguest/boot.c | 5 +++-- arch/x86/xen/enlighten.c | 3 ++- include/asm-x86/setup.h | 4 +++- 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c216d3c..0b6cb9f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -76,7 +76,8 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - reserve_early(__pa_symbol(&_end), init_pg_tables_end, "INIT_PG_TABLE"); + reserve_early(init_pg_tables_start, init_pg_tables_end, + "INIT_PG_TABLE"); reserve_ebda_region(); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b2cc737..bef4618 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -194,6 +194,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ movl $pa(pg0), %edi + movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx movl $PTE_ATTR, %eax 10: @@ -228,6 +229,7 @@ default_entry: page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi + movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx movl $PTE_ATTR, %eax 10: diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 08b47a2..de9c5ee 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -71,6 +71,7 @@ /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ +unsigned long init_pg_tables_start __initdata = ~0UL; unsigned long init_pg_tables_end __initdata = ~0UL; /* @@ -485,6 +486,10 @@ static void __init reserve_initrd(void) return; } + printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image, + ramdisk_end); + + if (ramdisk_end <= end_of_lowmem) { /* All in lowmem, easy case */ /* @@ -511,6 +516,8 @@ static void __init reserve_initrd(void) "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", + ramdisk_here, ramdisk_here + ramdisk_size); do_relocate_initrd = true; } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index af65b2d..bf7c34a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1011,6 +1011,7 @@ __init void lguest_init(void) * clobbered. The Launcher places our initial pagetables somewhere at * the top of our physical memory, so we don't need extra space: set * init_pg_tables_end to the end of the kernel. */ + init_pg_tables_start = __pa(pg0); init_pg_tables_end = __pa(pg0); /* Load the %fs segment register (the per-cpu segment register) with @@ -1064,9 +1065,9 @@ __init void lguest_init(void) pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call start_kernel() in init/main.c and we proceed + /* Now we're set up, call i386_start_kernel() in head32.c and we proceed * to boot as normal. It never returns. */ - start_kernel(); + i386_start_kernel(); } /* * This marks the end of stage II of our journey, The Guest. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..ccc7b84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1211,6 +1211,7 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; init_mm.pgd = pgd; /* use the Xen pagetables to start */ @@ -1246,5 +1247,5 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); /* Start the world */ - start_kernel(); + i386_start_kernel(); } diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index ffa0f54..64f5f0c1 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -59,9 +59,11 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map); void __init add_memory_region(unsigned long long start, unsigned long long size, int type); -extern unsigned long init_pg_tables_end; +void __init i386_start_kernel(void); +extern unsigned long init_pg_tables_start; +extern unsigned long init_pg_tables_end; #endif /* __i386__ */ #endif /* _SETUP */ -- cgit v1.1 From 163872950dc856fd23849c27f60049feaac49ae6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 29 May 2008 12:57:22 -0700 Subject: x86: extend e820 early_res support 32bit -fix #4 reserve_early pgdata for 32bit numa Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914ccf9..4774972 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -159,8 +159,13 @@ static void __init allocate_pgdat(int nid) if (nid && node_has_online_mem(nid)) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); - min_low_pfn += PFN_UP(sizeof(pg_data_t)); + unsigned long pgdat_phys; + pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, + max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t), + PAGE_SIZE); + NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), + "NODE_DATA"); } } -- cgit v1.1 From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 29 May 2008 12:58:37 -0700 Subject: x86: extend e820 early_res support 32bit -fix #5 reserve early numa kva, so it will not clash with new RAMDISK Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 1 - arch/x86/mm/discontig_32.c | 12 +++++------- include/asm-x86/mmzone_32.h | 4 ---- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index de9c5ee..6f40cb5 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -614,7 +614,6 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - numa_kva_reserve(); reserve_crashkernel(); reserve_ibft_region(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 4774972..55fdbab 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -357,6 +357,11 @@ unsigned long __init setup_memory(void) printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); + + /* avoid clash with initrd */ + reserve_early(kva_start_pfn<<PAGE_SHIFT, + (kva_start_pfn + kva_pages)<<PAGE_SHIFT, + "KVA PG"); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > system_max_low_pfn) @@ -392,13 +397,6 @@ unsigned long __init setup_memory(void) return max_low_pfn; } -void __init numa_kva_reserve(void) -{ - if (kva_pages) - reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages), - BOOTMEM_DEFAULT); -} - void __init zone_sizes_init(void) { int nid; diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h index cb2cad0..faef751 100644 --- a/include/asm-x86/mmzone_32.h +++ b/include/asm-x86/mmzone_32.h @@ -38,16 +38,12 @@ static inline void get_memcfg_numa(void) } extern int early_pfn_to_nid(unsigned long pfn); -extern void numa_kva_reserve(void); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat #define get_zholes_size(n) (0) -static inline void numa_kva_reserve(void) -{ -} #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM -- cgit v1.1 From 9a73aa81ffb993382afed2ed404bc2b330d75427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 29 May 2008 16:25:56 -0700 Subject: x86: 32bit numa srat fix early_ioremap leak on two node system (16g RAM) with numa config I got this crash: get_memcfg_from_srat: assigning address to rsdp RSD PTR v0 [ACPIAM] ACPI: Too big length in RSDT: 92 failed to get NUMA memory information from SRAT table NUMA - single node, flat memory mode Node: 0, start_pfn: 0, end_pfn: 153 Setting physnode_map array to node 0 for pfns: 0 ... Pid: 0, comm: swapper Not tainted 2.6.26-rc4 #4 [<80b41289>] hlt_loop+0x0/0x3 [<8011efa0>] ? alloc_remap+0x50/0x70 [<8079e32e>] alloc_node_mem_map+0x5e/0xa0 [<8012e77b>] ? printk+0x1b/0x20 [<80b590f6>] free_area_init_node+0xc6/0x470 [<80b588fc>] ? __alloc_bootmem_node+0x2c/0x50 [<80b58ad8>] ? find_min_pfn_for_node+0x38/0x70 [<8012e77b>] ? printk+0x1b/0x20 [<80b597c4>] free_area_init_nodes+0x254/0x2d0 [<80b544d7>] zone_sizes_init+0x97/0xa0 [<80b48a03>] setup_arch+0x383/0x530 [<8012e77b>] ? printk+0x1b/0x20 [<80b41aa4>] start_kernel+0x64/0x350 [<80b412d8>] i386_start_kernel+0x8/0x10 ======================= this patch increases the acpi table limit to 32. Also match early_ioremap() with early_iounmap(). Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/srat_32.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 70e4a37..88971ee 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -261,7 +261,7 @@ out_fail: struct acpi_static_rsdt { struct acpi_table_rsdt table; - u32 padding[7]; /* Allow for 7 more table entries */ + u32 padding[32]; /* Allow for 32 more table entries */ }; int __init get_memcfg_from_srat(void) @@ -297,7 +297,7 @@ int __init get_memcfg_from_srat(void) } rsdt = (struct acpi_table_rsdt *) - early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); + early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt)); if (!rsdt) { printk(KERN_WARNING @@ -310,6 +310,7 @@ int __init get_memcfg_from_srat(void) if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); + early_iounmap(rsdt, sizeof(saved_rsdt)); goto out_err; } @@ -319,37 +320,51 @@ int __init get_memcfg_from_srat(void) * size of RSDT) divided by the size of each entry * (4-byte table pointers). */ - tables = (header->length - sizeof(struct acpi_table_header)) / 4; + tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32); if (!tables) goto out_err; memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); - + early_iounmap(rsdt, sizeof(saved_rsdt)); if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", saved_rsdt.table.header.length); goto out_err; } - printk("Begin SRAT table scan....\n"); + printk("Begin SRAT table scan....%d\n", tables); - for (i = 0; i < tables; i++) { + for (i = 0; i < tables; i++){ + int result; + u32 length; /* Map in header, then map in full table length. */ header = (struct acpi_table_header *) early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); if (!header) break; + + printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n", + header->signature, + (unsigned long)saved_rsdt.table.table_offset_entry[i], + header->length); + + if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) { + early_iounmap(header, sizeof(struct acpi_table_header)); + continue; + } + + length = header->length; + early_iounmap(header, sizeof(struct acpi_table_header)); header = (struct acpi_table_header *) - early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); + early_ioremap(saved_rsdt.table.table_offset_entry[i], length); if (!header) break; - if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) - continue; - /* we've found the srat table. don't need to look at any more tables */ - return acpi20_parse_srat((struct acpi_table_srat *)header); + result = acpi20_parse_srat((struct acpi_table_srat *)header); + early_iounmap(header, length); + return result; } out_err: remove_all_active_ranges(); -- cgit v1.1 From 831d991821daedd4839073dbca55514432ef1768 Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Mon, 3 Sep 2007 10:17:39 +0200 Subject: x86: add PCI extended config space access for AMD Barcelona This patch implements PCI extended configuration space access for AMD's Barcelona CPUs. It extends the method using CF8/CFC IO addresses. An x86 capability bit has been introduced that is set for CPUs supporting PCI extended config space accesses. Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd.c | 4 ++++ arch/x86/kernel/cpu/amd_64.c | 3 +++ arch/x86/kernel/setup.c | 13 +++++++++++++ arch/x86/kernel/setup.h | 26 ++++++++++++++++++++++++++ arch/x86/kernel/setup_64.c | 2 ++ arch/x86/pci/direct.c | 21 +++++++++++++++------ include/asm-x86/cpufeature.h | 2 ++ 7 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/setup.h diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2458668..99221f9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include <asm/apic.h> #include <mach_apic.h> +#include "../setup.h" #include "cpu.h" /* @@ -308,6 +309,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + + if (c->x86 == 0x10) + amd_enable_pci_ext_cfg(c); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index c815c2c..180097e 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -217,6 +217,9 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) fam10h_check_enable_mmcfg(); + if (c->x86 == 0x10) + amd_enable_pci_ext_cfg(c); + if (amd_apic_timer_broken()) disable_apic_timer = 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b85..d8f1712 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -136,4 +136,17 @@ void __init setup_per_cpu_areas(void) setup_cpumask_of_cpu(); } +#define ENABLE_CF8_EXT_CFG (1ULL << 46) + +void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) +{ + u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); + if (!(reg & ENABLE_CF8_EXT_CFG)) { + reg |= ENABLE_CF8_EXT_CFG; + wrmsrl(MSR_AMD64_NB_CFG, reg); + } + set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG); +} + #endif diff --git a/arch/x86/kernel/setup.h b/arch/x86/kernel/setup.h new file mode 100644 index 0000000..66cc2c7 --- /dev/null +++ b/arch/x86/kernel/setup.h @@ -0,0 +1,26 @@ +/* + * Internal declarations for shared x86 setup code. + * + * Copyright (c) 2008 Advanced Micro Devices, Inc. + * Contributed by Robert Richter <robert.richter@amd.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef _ARCH_X86_KERNEL_SETUP_H + +extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); + +#endif /* _ARCH_X86_KERNEL_SETUP_H */ diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 25afdd8..215bd67 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -73,6 +73,8 @@ #include <asm/pat.h> #include <asm/mmconfig.h> +#include "setup.h" + #include <mach_apic.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 21d1e0e..27d61b6 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -8,18 +8,21 @@ #include "pci.h" /* - * Functions for accessing PCI configuration space with type 1 accesses + * Functions for accessing PCI base (first 256 bytes) and extended + * (4096 bytes per PCI function) configuration space with type 1 + * accesses. */ #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \ + | (devfn << 8) | (reg & 0xFC)) static int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 255)) { + if ((bus > 255) || (devfn > 255) || (reg > 4095)) { *value = -1; return -EINVAL; } @@ -50,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, { unsigned long flags; - if ((bus > 255) || (devfn > 255) || (reg > 255)) + if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); @@ -260,10 +263,16 @@ void __init pci_direct_init(int type) return; printk(KERN_INFO "PCI: Using configuration type %d for base access\n", type); - if (type == 1) + if (type == 1) { raw_pci_ops = &pci_direct_conf1; - else + if (!raw_pci_ext_ops && cpu_has_pci_ext_cfg) { + printk(KERN_INFO "PCI: Using configuration type 1 " + "for extended access\n"); + raw_pci_ext_ops = &pci_direct_conf1; + } + } else { raw_pci_ops = &pci_direct_conf2; + } } int __init pci_direct_probe(void) diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 0d609c8..40fcbba 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -79,6 +79,7 @@ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */ #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */ +#define X86_FEATURE_PCI_EXT_CFG (3*32+19) /* PCI extended cfg access */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ @@ -187,6 +188,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) +#define cpu_has_pci_ext_cfg boot_cpu_has(X86_FEATURE_PCI_EXT_CFG) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.1 From 6fc92866a4a6778a9732a14b61f70cb9014b2a1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 2 Jun 2008 10:54:16 +0200 Subject: fix build bug in "x86: add PCI extended config space access for AMD Barcelona" Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mmconfig.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h index 95beda0..46d6bb1 100644 --- a/include/asm-x86/mmconfig.h +++ b/include/asm-x86/mmconfig.h @@ -9,4 +9,6 @@ static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } #endif +extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); + #endif -- cgit v1.1 From c46e62f73569d7ef42255bd6f31e35925b7f1492 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Wed, 28 May 2008 12:42:57 +0200 Subject: i8259: fix final ugliness Introduce IRQx_VECTOR on 32-bit, so that #ifdef noise is kept down. There should be no object code change. [ mingo@elte.hu: merged to x86/irq not x86/i8259 due to x86/irq having restructured the vector code into asm-x86/irq_vectors.h, which this patch touches. ] Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/i8259.c | 22 ++++++++-------------- include/asm-x86/irq_vectors.h | 9 ++++++--- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 7a0fda8..dc92b49 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -297,34 +297,28 @@ void init_8259A(int auto_eoi) * outb_pic - this has to work on a wide range of PC hardware. */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ -#ifndef CONFIG_X86_64 - outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ -#else /* CONFIG_X86_64 */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, + to 0x20-0x27 on i386 */ outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ - outb_pic(0x04, PIC_MASTER_IMR); -#endif /* CONFIG_X86_64 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + if (auto_eoi) /* master does Auto EOI */ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ -#ifndef CONFIG_X86_64 - outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ -#else /* CONFIG_X86_64 */ - /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + + /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); -#endif /* CONFIG_X86_64 */ if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index 3cb6d8c..b58581e 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -18,17 +18,20 @@ #endif /* - * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. - * * Reserve the lowest usable priority level 0x20 - 0x2f for triggering * cleanup after irq migration on 64 bit. */ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR /* - * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit + * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. + * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit. */ +#ifdef CONFIG_X86_32 +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR) +#else #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#endif #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) #define IRQ3_VECTOR (IRQ0_VECTOR + 3) -- cgit v1.1 From c78277288e3d561d55fb48bc0fe8d6e2cf4d0880 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Thu, 29 May 2008 09:02:19 +0100 Subject: CONFIG_PM_SLEEP fix: xen: fix compilation when CONFIG_PM_SLEEP is disabled Xen save/restore depends on CONFIG_PM_SLEEP being set for device_power_up/down. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Randy Dunlap <randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- drivers/xen/manage.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 675c3dd..5b546e3 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -31,6 +31,7 @@ enum shutdown_state { /* Ignore multiple shutdown requests. */ static enum shutdown_state shutting_down = SHUTDOWN_INVALID; +#ifdef CONFIG_PM_SLEEP static int xen_suspend(void *data) { int *cancelled = data; @@ -121,6 +122,7 @@ out: #endif shutting_down = SHUTDOWN_INVALID; } +#endif /* CONFIG_PM_SLEEP */ static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) -- cgit v1.1 From 1a5726528a70bb239bdd149aef7f2155cd2b1699 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 2 Jun 2008 12:21:36 +0200 Subject: fix build bug in "x86: add PCI extended config space access for AMD Barcelona" --- arch/x86/kernel/cpu/amd.c | 1 + include/asm-x86/mmconfig.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 99221f9..656b40a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -4,6 +4,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/apic.h> +#include <asm/mmconfig.h> #include <mach_apic.h> #include "../setup.h" diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h index 46d6bb1..691798f 100644 --- a/include/asm-x86/mmconfig.h +++ b/include/asm-x86/mmconfig.h @@ -9,6 +9,10 @@ static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } #endif +#if defined(CONFIG_SMP) && defined(CONFIG_X86_64) extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); +#else +static inline void amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) { } +#endif #endif -- cgit v1.1 From c1f64a58003fd2efaa725a857e269a15f765791a Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Tue, 27 May 2008 09:47:13 -0700 Subject: x86: MMIO and gcc re-ordering issue On Tue, 27 May 2008, Linus Torvalds wrote: > > Expecting people to fix up all drivers is simply not going to happen. And > serializing things shouldn't be *that* expensive. People who cannot take > the expense can continue to use the magic __raw_writel() etc stuff. Of course, for non-x86, you kind of have to expect drivers to be well-behaved, so non-x86 can probably avoid this simply because there are less relevant drivers involved. Here's a UNTESTED patch for x86 that may or may not compile and work, and which serializes (on a compiler level) the IO accesses against regular memory accesses. __read[bwlq]()/__write[bwlq]() are not serialized with a :"memory" barrier, although since they still use "asm volatile" I suspect that i practice they are probably serial too. Did not look very closely at any generated code (only did a trivial test to see that the code looks *roughly* correct). Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/io.h | 56 ++++++++++++++++++++++++++++++++++++++ include/asm-x86/io_32.h | 49 ---------------------------------- include/asm-x86/io_64.h | 71 ------------------------------------------------- 3 files changed, 56 insertions(+), 120 deletions(-) diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index d5b11f6..8e9eca9 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -3,6 +3,62 @@ #define ARCH_HAS_IOREMAP_WC +#include <linux/compiler.h> + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":"=" reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "q", :"memory") +build_mmio_read(readw, "w", unsigned short, "r", :"memory") +build_mmio_read(readl, "l", unsigned int, "r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "q", ) +build_mmio_read(__readw, "w", unsigned short, "r", ) +build_mmio_read(__readl, "l", unsigned int, "r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 +build_mmio_read(readq, "q", unsigned long, "r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "r", ) +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) + +#define readq_relaxed(a) __readq(a) +#define __raw_readq __readq +#define __raw_writeq writeq + +/* Let people know we have them */ +#define readq readq +#define writeq writeq +#endif + #ifdef CONFIG_X86_32 # include "io_32.h" #else diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index 049e81e..d71be8d 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -149,55 +149,6 @@ extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt -/* - * readX/writeX() are used to access memory mapped devices. On some - * architectures the memory mapped IO stuff needs to be accessed - * differently. On the x86 architecture, we just read/write the - * memory location directly. - */ - -static inline unsigned char readb(const volatile void __iomem *addr) -{ - return *(volatile unsigned char __force *)addr; -} - -static inline unsigned short readw(const volatile void __iomem *addr) -{ - return *(volatile unsigned short __force *)addr; -} - -static inline unsigned int readl(const volatile void __iomem *addr) -{ - return *(volatile unsigned int __force *) addr; -} - -#define readb_relaxed(addr) readb(addr) -#define readw_relaxed(addr) readw(addr) -#define readl_relaxed(addr) readl(addr) -#define __raw_readb readb -#define __raw_readw readw -#define __raw_readl readl - -static inline void writeb(unsigned char b, volatile void __iomem *addr) -{ - *(volatile unsigned char __force *)addr = b; -} - -static inline void writew(unsigned short b, volatile void __iomem *addr) -{ - *(volatile unsigned short __force *)addr = b; -} - -static inline void writel(unsigned int b, volatile void __iomem *addr) -{ - *(volatile unsigned int __force *)addr = b; -} -#define __raw_writeb writeb -#define __raw_writew writew -#define __raw_writel writel - -#define mmiowb() - static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count) { diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index 0930bed..ddd8058 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -204,77 +204,6 @@ extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define virt_to_bus virt_to_phys #define bus_to_virt phys_to_virt -/* - * readX/writeX() are used to access memory mapped devices. On some - * architectures the memory mapped IO stuff needs to be accessed - * differently. On the x86 architecture, we just read/write the - * memory location directly. - */ - -static inline __u8 __readb(const volatile void __iomem *addr) -{ - return *(__force volatile __u8 *)addr; -} - -static inline __u16 __readw(const volatile void __iomem *addr) -{ - return *(__force volatile __u16 *)addr; -} - -static __always_inline __u32 __readl(const volatile void __iomem *addr) -{ - return *(__force volatile __u32 *)addr; -} - -static inline __u64 __readq(const volatile void __iomem *addr) -{ - return *(__force volatile __u64 *)addr; -} - -#define readb(x) __readb(x) -#define readw(x) __readw(x) -#define readl(x) __readl(x) -#define readq(x) __readq(x) -#define readb_relaxed(a) readb(a) -#define readw_relaxed(a) readw(a) -#define readl_relaxed(a) readl(a) -#define readq_relaxed(a) readq(a) -#define __raw_readb readb -#define __raw_readw readw -#define __raw_readl readl -#define __raw_readq readq - -#define mmiowb() - -static inline void __writel(__u32 b, volatile void __iomem *addr) -{ - *(__force volatile __u32 *)addr = b; -} - -static inline void __writeq(__u64 b, volatile void __iomem *addr) -{ - *(__force volatile __u64 *)addr = b; -} - -static inline void __writeb(__u8 b, volatile void __iomem *addr) -{ - *(__force volatile __u8 *)addr = b; -} - -static inline void __writew(__u16 b, volatile void __iomem *addr) -{ - *(__force volatile __u16 *)addr = b; -} - -#define writeq(val, addr) __writeq((val), (addr)) -#define writel(val, addr) __writel((val), (addr)) -#define writew(val, addr) __writew((val), (addr)) -#define writeb(val, addr) __writeb((val), (addr)) -#define __raw_writeb writeb -#define __raw_writew writew -#define __raw_writel writel -#define __raw_writeq writeq - void __memcpy_fromio(void *, unsigned long, unsigned); void __memcpy_toio(unsigned long, const void *, unsigned); -- cgit v1.1 From 83bea8e1fa0c47b30664b6f92397c016c22f77fb Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Tue, 27 May 2008 21:03:46 +0200 Subject: x86: fix incomplete include guard in include/asm-x86/seccomp_32.h Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/seccomp_32.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86/seccomp_32.h b/include/asm-x86/seccomp_32.h index 18da19e..36e71c5 100644 --- a/include/asm-x86/seccomp_32.h +++ b/include/asm-x86/seccomp_32.h @@ -1,4 +1,5 @@ #ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H #include <linux/thread_info.h> -- cgit v1.1 From 6330a30a76c1e62d4b4ec238368957f8febf9113 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Wed, 28 May 2008 09:46:19 +0200 Subject: x86: break mutual header inclusion This breaks up the mutual inclusion between headers ptrace.h and vm86.h by moving some small part of vm86.h which is needed by ptrace.h into processor-flags.h. We also try to move #include lines to the top. This has been compile tested on x86_32 and x86_64 defconfig, and run through 'make headers_check'. Cc: Adrian Bunk <bunk@kernel.org> Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/processor-flags.h | 6 ++++++ include/asm-x86/ptrace.h | 8 +++++--- include/asm-x86/vm86.h | 8 +------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/asm-x86/processor-flags.h b/include/asm-x86/processor-flags.h index 199cab1..092b39b 100644 --- a/include/asm-x86/processor-flags.h +++ b/include/asm-x86/processor-flags.h @@ -88,4 +88,10 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#ifdef CONFIG_VM86 +#define X86_VM_MASK X86_EFLAGS_VM +#else +#define X86_VM_MASK 0 /* No VM86 support */ +#endif + #endif /* __ASM_I386_PROCESSOR_FLAGS_H */ diff --git a/include/asm-x86/ptrace.h b/include/asm-x86/ptrace.h index 9f922b0..8a71db8 100644 --- a/include/asm-x86/ptrace.h +++ b/include/asm-x86/ptrace.h @@ -3,7 +3,12 @@ #include <linux/compiler.h> /* For __user */ #include <asm/ptrace-abi.h> +#include <asm/processor-flags.h> +#ifdef __KERNEL__ +#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */ +#include <asm/segment.h> +#endif #ifndef __ASSEMBLY__ @@ -55,9 +60,6 @@ struct pt_regs { unsigned long ss; }; -#include <asm/vm86.h> -#include <asm/segment.h> - #endif /* __KERNEL__ */ #else /* __i386__ */ diff --git a/include/asm-x86/vm86.h b/include/asm-x86/vm86.h index cbf4a0e..5ce3513 100644 --- a/include/asm-x86/vm86.h +++ b/include/asm-x86/vm86.h @@ -115,7 +115,6 @@ struct vm86plus_info_struct { unsigned long is_vm86pus:1; /* for vm86 internal use */ unsigned char vm86dbg_intxxtab[32]; /* for debugger */ }; - struct vm86plus_struct { struct vm86_regs regs; unsigned long flags; @@ -128,11 +127,7 @@ struct vm86plus_struct { #ifdef __KERNEL__ -#ifdef CONFIG_VM86 -#define X86_VM_MASK X86_EFLAGS_VM -#else -#define X86_VM_MASK 0 /* No VM86 support */ -#endif +#include <asm/ptrace.h> /* * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86 @@ -142,7 +137,6 @@ struct vm86plus_struct { * at the end of the structure. Look at ptrace.h to see the "normal" * setup. For user space layout see 'struct vm86_regs' above. */ -#include <asm/ptrace.h> struct kernel_vm86_regs { /* -- cgit v1.1 From fb093eab6d8963a085f112773284f2bcb3d7907b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 27 May 2008 16:29:20 -0700 Subject: x86: remove duplicated e820 func in setup.h we already have them in e820.h Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/setup.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 64f5f0c1..9e163fc 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -50,15 +50,9 @@ extern struct boot_params boot_params; */ #define LOWMEMSIZE() (0x9f000) -struct e820entry; - char * __init machine_specific_memory_setup(void); char *memory_setup(void); -int __init copy_e820_map(struct e820entry *biosmap, int nr_map); -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type); - void __init i386_start_kernel(void); -- cgit v1.1 From 88ff0a474e98f869d8c321e29481f298320100d6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Date: Tue, 27 May 2008 18:49:39 -0700 Subject: x86: coding style fixes for nmi.c before total: 1 errors, 6 warnings, 534 lines checked after total: 0 errors, 1 warnings, 532 lines checked Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3671a9f..bf143f1 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -23,9 +23,8 @@ #include <linux/cpumask.h> #include <linux/kernel_stat.h> #include <linux/kdebug.h> +#include <linux/smp.h> -#include <asm/smp.h> -#include <asm/nmi.h> #include <asm/proto.h> #include <asm/timer.h> @@ -45,13 +44,16 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -static int panic_on_timeout; +EXPORT_SYMBOL(nmi_active); unsigned int nmi_watchdog = NMI_DEFAULT; +EXPORT_SYMBOL(nmi_watchdog); + +static int panic_on_timeout; static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata = 0; +static int endflag __initdata; static inline unsigned int get_nmi_count(int cpu) { @@ -404,7 +406,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); - printk("NMI backtrace for cpu %d\n", cpu); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); dump_stack(); spin_unlock(&lock); cpu_clear(cpu, backtrace_mask); @@ -529,7 +531,3 @@ void __trigger_all_cpu_backtrace(void) mdelay(1); } } - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); - -- cgit v1.1 From 9f5314fb4d556d3132c784d0df47352b2830ca53 Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Wed, 28 May 2008 09:51:18 -0500 Subject: x86, uv: update macros used by UV platform Update the UV address macros to better describe the fields of UV physical addresses. Improve comments in the header files. Add additional MMR definitions. Signed-off-by: Jack Steiner <steiner@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/genx2apic_uv_x.c | 141 +++++++---- include/asm-x86/uv/uv_hub.h | 188 ++++++++++----- include/asm-x86/uv/uv_mmrs.h | 509 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 731 insertions(+), 107 deletions(-) diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ebf1390..45e84ac 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ #include <linux/threads.h> @@ -55,37 +55,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu) int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) { unsigned long val; - int nasid; + int pnode; - nasid = uv_apicid_to_nasid(phys_apicid); + pnode = uv_apicid_to_pnode(phys_apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_STARTUP; - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); return 0; } static void uv_send_IPI_one(int cpu, int vector) { unsigned long val, apicid, lapicid; - int nasid; + int pnode; apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ lapicid = apicid & 0x3f; /* ZZZ macro needed */ - nasid = uv_apicid_to_nasid(apicid); + pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << UVH_IPI_INT_APIC_ID_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); - uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(cpumask_t mask, int vector) @@ -159,39 +159,81 @@ struct genapic apic_x2apic_uv_x = { .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ }; -static __cpuinit void set_x2apic_extra_bits(int nasid) +static __cpuinit void set_x2apic_extra_bits(int pnode) { - __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6); + __get_cpu_var(x2apic_extra_bits) = (pnode << 6); } /* * Called on boot cpu. */ +static __init int boot_pnode_to_blade(int pnode) +{ + int blade; + + for (blade = 0; blade < uv_num_possible_blades(); blade++) + if (pnode == uv_blade_info[blade].pnode) + return blade; + BUG(); +} + +struct redir_addr { + unsigned long redirect; + unsigned long alias; +}; + +#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT + +static __initdata struct redir_addr redir_addrs[] = { + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, +}; + +static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +{ + union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + int i; + + for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { + alias.v = uv_read_local_mmr(redir_addrs[i].alias); + if (alias.s.base == 0) { + *size = (1UL << alias.s.m_alias); + redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + return; + } + } + BUG(); +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; - int bytes, nid, cpu, lcpu, nasid, last_nasid, blade; - unsigned long mmr_base; + union uvh_node_id_u node_id; + unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + unsigned long mmr_base, present; m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_val = m_n_config.s.m_skt; + n_val = m_n_config.s.n_skt; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); - last_nasid = -1; - for_each_possible_cpu(cpu) { - nid = cpu_to_node(cpu); - nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); - if (nasid != last_nasid) - uv_possible_blades++; - last_nasid = nasid; - } + for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) + uv_possible_blades += + hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = alloc_bootmem_pages(bytes); + get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = alloc_bootmem_pages(bytes); memset(uv_node_to_blade, 255, bytes); @@ -200,43 +242,56 @@ static __init void uv_system_init(void) uv_cpu_to_blade = alloc_bootmem_pages(bytes); memset(uv_cpu_to_blade, 255, bytes); - last_nasid = -1; - blade = -1; - lcpu = -1; - for_each_possible_cpu(cpu) { - nid = cpu_to_node(cpu); - nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu)); - if (nasid != last_nasid) { - blade++; - lcpu = -1; - uv_blade_info[blade].nr_posible_cpus = 0; + blade = 0; + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + for (j = 0; j < 64; j++) { + if (!test_bit(j, &present)) + continue; + uv_blade_info[blade].pnode = (i * 64 + j); + uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + blade++; } - last_nasid = nasid; - lcpu++; + } - uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt; - uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_upper = (((unsigned long)node_id.s.node_id) & + ~((1 << n_val) - 1)) << m_val; + + for_each_present_cpu(cpu) { + nid = cpu_to_node(cpu); + pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + blade = boot_pnode_to_blade(pnode); + lcpu = uv_blade_info[blade].nr_possible_cpus; + uv_blade_info[blade].nr_possible_cpus++; + + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; + uv_cpu_hub_info(cpu)->lowmem_remap_top = + lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->m_val = m_val; + uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; - uv_cpu_hub_info(cpu)->local_nasid = nasid; - uv_cpu_hub_info(cpu)->gnode_upper = - nasid & ~((1 << uv_hub_info->n_val) - 1); + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ - uv_blade_info[blade].nasid = nasid; - uv_blade_info[blade].nr_posible_cpus++; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid); - printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade); + printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " + "lcpu %d, blade %d\n", + cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, + lcpu, blade); } } /* * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet */ void __cpuinit uv_cpu_init(void) { @@ -246,5 +301,5 @@ void __cpuinit uv_cpu_init(void) uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->local_nasid); + set_x2apic_extra_bits(uv_hub_info->pnode); } diff --git a/include/asm-x86/uv/uv_hub.h b/include/asm-x86/uv/uv_hub.h index 26b9240..6500488 100644 --- a/include/asm-x86/uv/uv_hub.h +++ b/include/asm-x86/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ #ifndef __ASM_X86_UV_HUB_H__ @@ -20,26 +20,49 @@ /* * Addressing Terminology * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. + * M - The low M bits of a physical address represent the offset + * into the blade local memory. RAM memory on a blade is physically + * contiguous (although various IO spaces may punch holes in + * it).. * - * SNASID - NASID right shifted by 1 bit. + * N - Number of bits in the node portion of a socket physical + * address. + * + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. + * + * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead + * of nasids. + * + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. + * + * + * NumaLink Global Physical Address Format: + * +--------------------------------+---------------------+ + * |00..000| GNODE | NodeOffset | + * +--------------------------------+---------------------+ + * |<-------53 - M bits --->|<--------M bits -----> + * + * M - number of node offset bits (35 .. 40) * * * Memory/UV-HUB Processor Socket Address Format: - * +--------+---------------+---------------------+ - * |00..0000| SNASID | NodeOffset | - * +--------+---------------+---------------------+ - * <--- N bits --->|<--------M bits -----> + * +----------------+---------------+---------------------+ + * |00..000000000000| PNODE | NodeOffset | + * +----------------+---------------+---------------------+ + * <--- N bits --->|<--------M bits -----> * - * M number of node offset bits (35 .. 40) - * N number of SNASID bits (0 .. 10) + * M - number of node offset bits (35 .. 40) + * N - number of PNODE bits (0 .. 10) * * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64). * The actual values are configuration dependent and are set at - * boot time + * boot time. M & N values are set by the hardware/BIOS at boot. + * * * APICID format * NOTE!!!!!! This is the current format of the APICID. However, code @@ -48,14 +71,14 @@ * * 1111110000000000 * 5432109876543210 - * nnnnnnnnnnlc0cch + * pppppppppplc0cch * sssssssssss * - * n = snasid bits + * p = pnode bits * l = socket number on board * c = core * h = hyperthread - * s = bits that are in the socket CSR + * s = bits that are in the SOCKET_ID CSR * * Note: Processor only supports 12 bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. @@ -74,7 +97,7 @@ * This value is also the value of the maximum number of non-router NASIDs * in the numalink fabric. * - * NOTE: a brick may be 1 or 2 OS nodes. Don't get these confused. + * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused. */ #define UV_MAX_NUMALINK_BLADES 16384 @@ -96,8 +119,12 @@ */ struct uv_hub_info_s { unsigned long global_mmr_base; - unsigned short local_nasid; - unsigned short gnode_upper; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; unsigned short coherency_domain_number; unsigned short numa_blade_id; unsigned char blade_processor_id; @@ -112,83 +139,124 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * Local & Global MMR space macros. * Note: macros are intended to be used ONLY by inline functions * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ -#define UV_SNASID(n) ((n) >> 1) -#define UV_NASID(n) ((n) << 1) +#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) +#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) -#define UV_GLOBAL_MMR32_SNASID_MASK 0x3ff -#define UV_GLOBAL_MMR32_SNASID_SHIFT 15 -#define UV_GLOBAL_MMR64_SNASID_SHIFT 26 +#define UV_GLOBAL_MMR32_PNODE_SHIFT 15 +#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 -#define UV_GLOBAL_MMR32_NASID_BITS(n) \ - (((UV_SNASID(n) & UV_GLOBAL_MMR32_SNASID_MASK)) << \ - (UV_GLOBAL_MMR32_SNASID_SHIFT)) +#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) -#define UV_GLOBAL_MMR64_NASID_BITS(n) \ - ((unsigned long)UV_SNASID(n) << UV_GLOBAL_MMR64_SNASID_SHIFT) +#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ + ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + +#define UV_APIC_PNODE_SHIFT 6 + +/* + * Macros for converting between kernel virtual addresses, socket local physical + * addresses, and UV global physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. + */ + +/* socket phys RAM --> UV global physical address */ +static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) +{ + if (paddr < uv_hub_info->lowmem_remap_top) + paddr += uv_hub_info->lowmem_remap_base; + return paddr | uv_hub_info->gnode_upper; +} + + +/* socket virtual --> UV global physical address */ +static inline unsigned long uv_gpa(void *v) +{ + return __pa(v) | uv_hub_info->gnode_upper; +} + +/* socket virtual --> UV global physical address */ +static inline void *uv_vgpa(void *v) +{ + return (void *)uv_gpa(v); +} + +/* UV global physical address --> socket virtual */ +static inline void *uv_va(unsigned long gpa) +{ + return __va(gpa & uv_hub_info->gpa_mask); +} + +/* pnode, offset --> socket virtual */ +static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) +{ + return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); +} -#define UV_APIC_NASID_SHIFT 6 /* - * Extract a NASID from an APICID (full apicid, not processor subset) + * Extract a PNODE from an APICID (full apicid, not processor subset) */ -static inline int uv_apicid_to_nasid(int apicid) +static inline int uv_apicid_to_pnode(int apicid) { - return (UV_NASID(apicid >> UV_APIC_NASID_SHIFT)); + return (apicid >> UV_APIC_PNODE_SHIFT); } /* * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int nasid, +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | - UV_GLOBAL_MMR32_NASID_BITS(nasid) | offset); + UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int nasid, unsigned long offset, +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr32_address(nasid, offset) = val; + *uv_global_mmr32_address(pnode, offset) = val; } -static inline unsigned long uv_read_global_mmr32(int nasid, +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { - return *uv_global_mmr32_address(nasid, offset); + return *uv_global_mmr32_address(pnode, offset); } /* * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int nasid, +static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | - UV_GLOBAL_MMR64_NASID_BITS(nasid) | offset); + UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int nasid, unsigned long offset, +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { - *uv_global_mmr64_address(nasid, offset) = val; + *uv_global_mmr64_address(pnode, offset) = val; } -static inline unsigned long uv_read_global_mmr64(int nasid, +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { - return *uv_global_mmr64_address(nasid, offset); + return *uv_global_mmr64_address(pnode, offset); } /* - * Access node local MMRs. Faster than using global space but only local MMRs + * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. */ static inline unsigned long *uv_local_mmr_address(unsigned long offset) @@ -207,15 +275,15 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) } /* - * Structures and definitions for converting between cpu, node, and blade + * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. */ struct uv_blade_info { - unsigned short nr_posible_cpus; + unsigned short nr_possible_cpus; unsigned short nr_online_cpus; - unsigned short nasid; + unsigned short pnode; }; -struct uv_blade_info *uv_blade_info; +extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; extern short *uv_cpu_to_blade; extern short uv_possible_blades; @@ -244,16 +312,16 @@ static inline int uv_node_to_blade_id(int nid) return uv_node_to_blade[nid]; } -/* Convert a blade id to the NASID of the blade */ -static inline int uv_blade_to_nasid(int bid) +/* Convert a blade id to the PNODE of the blade */ +static inline int uv_blade_to_pnode(int bid) { - return uv_blade_info[bid].nasid; + return uv_blade_info[bid].pnode; } /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { - return uv_blade_info[bid].nr_posible_cpus; + return uv_blade_info[bid].nr_possible_cpus; } /* Determine the number of online cpus on a blade */ @@ -262,16 +330,16 @@ static inline int uv_blade_nr_online_cpus(int bid) return uv_blade_info[bid].nr_online_cpus; } -/* Convert a cpu id to the NASID of the blade containing the cpu */ -static inline int uv_cpu_to_nasid(int cpu) +/* Convert a cpu id to the PNODE of the blade containing the cpu */ +static inline int uv_cpu_to_pnode(int cpu) { - return uv_blade_info[uv_cpu_to_blade_id(cpu)].nasid; + return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; } -/* Convert a node number to the NASID of the blade */ -static inline int uv_node_to_nasid(int nid) +/* Convert a linux node number to the PNODE of the blade */ +static inline int uv_node_to_pnode(int nid) { - return uv_blade_info[uv_node_to_blade_id(nid)].nasid; + return uv_blade_info[uv_node_to_blade_id(nid)].pnode; } /* Maximum possible number of blades */ diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index 3b69fe6..ac98460 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -11,11 +11,46 @@ #ifndef __ASM_X86_UV_MMRS__ #define __ASM_X86_UV_MMRS__ -/* - * AUTO GENERATED - Do not edit - */ +#define UV_MMR_ENABLE (1UL << 63) - #define UV_MMR_ENABLE (1UL << 63) +/* ========================================================================= */ +/* UVH_BAU_DATA_CONFIG */ +/* ========================================================================= */ +#define UVH_BAU_DATA_CONFIG 0x61680UL +#define UVH_BAU_DATA_CONFIG_32 0x0450 + +#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 +#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 +#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 +#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 +#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_BAU_DATA_CONFIG_P_SHFT 13 +#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_BAU_DATA_CONFIG_T_SHFT 15 +#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_BAU_DATA_CONFIG_M_SHFT 16 +#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 +#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_bau_data_config_u { + unsigned long v; + struct uvh_bau_data_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; /* ========================================================================= */ /* UVH_IPI_INT */ @@ -109,6 +144,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0aa0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL @@ -169,6 +205,7 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0aa8 /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ @@ -248,6 +285,331 @@ union uvh_lb_bau_sb_descriptor_base_u { }; /* ========================================================================= */ +/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */ +/* ========================================================================= */ +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL + +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_SHFT 22 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_MASK 0x0000000000400000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 23 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000000800000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 24 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000001000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 25 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000002000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 26 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000004000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 27 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000008000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 28 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000010000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 29 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000020000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 30 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000040000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 31 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000080000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 32 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000100000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 33 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000200000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 34 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000400000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 35 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000000800000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 36 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000001000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 37 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000002000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 38 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000004000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 39 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000008000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 40 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000010000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 41 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000020000000000UL + +union uvh_lb_mcast_aoerr0_rpt_enable_u { + unsigned long v; + struct uvh_lb_mcast_aoerr0_rpt_enable_s { + unsigned long mcast_obese_msg : 1; /* RW */ + unsigned long mcast_data_sb_err : 1; /* RW */ + unsigned long mcast_nack_buff_parity : 1; /* RW */ + unsigned long mcast_timeout : 1; /* RW */ + unsigned long mcast_inactive_reply : 1; /* RW */ + unsigned long mcast_upgrade_error : 1; /* RW */ + unsigned long mcast_reg_count_underflow : 1; /* RW */ + unsigned long mcast_rep_obese_msg : 1; /* RW */ + unsigned long ucache_req_runt_msg : 1; /* RW */ + unsigned long ucache_req_obese_msg : 1; /* RW */ + unsigned long ucache_req_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_runt_msg : 1; /* RW */ + unsigned long ucache_rep_obese_msg : 1; /* RW */ + unsigned long ucache_rep_data_sb_err : 1; /* RW */ + unsigned long ucache_rep_command_err : 1; /* RW */ + unsigned long ucache_pend_timeout : 1; /* RW */ + unsigned long macc_req_runt_msg : 1; /* RW */ + unsigned long macc_req_obese_msg : 1; /* RW */ + unsigned long macc_req_data_sb_err : 1; /* RW */ + unsigned long macc_rep_runt_msg : 1; /* RW */ + unsigned long macc_rep_obese_msg : 1; /* RW */ + unsigned long macc_rep_data_sb_err : 1; /* RW */ + unsigned long macc_timeout : 1; /* RW */ + unsigned long macc_spurious_event : 1; /* RW */ + unsigned long ioh_destination_table_parity : 1; /* RW */ + unsigned long get_had_error_reply : 1; /* RW */ + unsigned long get_timeout : 1; /* RW */ + unsigned long lock_manager_had_error_reply : 1; /* RW */ + unsigned long put_had_error_reply : 1; /* RW */ + unsigned long put_timeout : 1; /* RW */ + unsigned long sb_activation_overrun : 1; /* RW */ + unsigned long completed_gb_activation_had_error_reply : 1; /* RW */ + unsigned long completed_gb_activation_timeout : 1; /* RW */ + unsigned long descriptor_buffer_0_parity : 1; /* RW */ + unsigned long descriptor_buffer_1_parity : 1; /* RW */ + unsigned long socket_destination_table_parity : 1; /* RW */ + unsigned long bau_reply_payload_corruption : 1; /* RW */ + unsigned long io_port_destination_table_parity : 1; /* RW */ + unsigned long intd_soft_ack_timeout : 1; /* RW */ + unsigned long int_rep_obese_msg : 1; /* RW */ + unsigned long int_rep_command_err : 1; /* RW */ + unsigned long int_timeout : 1; /* RW */ + unsigned long rsvd_42_63 : 22; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_CONFIG */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_CONFIG 0x61000UL + +#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8 +#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13 +#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15 +#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16 +#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_local_int0_config_u { + unsigned long v; + struct uvh_local_int0_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_LOCAL_INT0_ENABLE */ +/* ========================================================================= */ +#define UVH_LOCAL_INT0_ENABLE 0x65000UL + +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0 +#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1 +#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2 +#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3 +#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4 +#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5 +#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6 +#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14 +#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15 +#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16 +#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17 +#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18 +#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19 +#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20 +#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21 +#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38 +#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39 +#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40 +#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41 +#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42 +#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43 +#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44 +#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL + +union uvh_local_int0_enable_u { + unsigned long v; + struct uvh_local_int0_enable_s { + unsigned long lb_hcerr : 1; /* RW */ + unsigned long gr0_hcerr : 1; /* RW */ + unsigned long gr1_hcerr : 1; /* RW */ + unsigned long lh_hcerr : 1; /* RW */ + unsigned long rh_hcerr : 1; /* RW */ + unsigned long xn_hcerr : 1; /* RW */ + unsigned long si_hcerr : 1; /* RW */ + unsigned long lb_aoerr0 : 1; /* RW */ + unsigned long gr0_aoerr0 : 1; /* RW */ + unsigned long gr1_aoerr0 : 1; /* RW */ + unsigned long lh_aoerr0 : 1; /* RW */ + unsigned long rh_aoerr0 : 1; /* RW */ + unsigned long xn_aoerr0 : 1; /* RW */ + unsigned long si_aoerr0 : 1; /* RW */ + unsigned long lb_aoerr1 : 1; /* RW */ + unsigned long gr0_aoerr1 : 1; /* RW */ + unsigned long gr1_aoerr1 : 1; /* RW */ + unsigned long lh_aoerr1 : 1; /* RW */ + unsigned long rh_aoerr1 : 1; /* RW */ + unsigned long xn_aoerr1 : 1; /* RW */ + unsigned long si_aoerr1 : 1; /* RW */ + unsigned long rh_vpi_int : 1; /* RW */ + unsigned long system_shutdown_int : 1; /* RW */ + unsigned long lb_irq_int_0 : 1; /* RW */ + unsigned long lb_irq_int_1 : 1; /* RW */ + unsigned long lb_irq_int_2 : 1; /* RW */ + unsigned long lb_irq_int_3 : 1; /* RW */ + unsigned long lb_irq_int_4 : 1; /* RW */ + unsigned long lb_irq_int_5 : 1; /* RW */ + unsigned long lb_irq_int_6 : 1; /* RW */ + unsigned long lb_irq_int_7 : 1; /* RW */ + unsigned long lb_irq_int_8 : 1; /* RW */ + unsigned long lb_irq_int_9 : 1; /* RW */ + unsigned long lb_irq_int_10 : 1; /* RW */ + unsigned long lb_irq_int_11 : 1; /* RW */ + unsigned long lb_irq_int_12 : 1; /* RW */ + unsigned long lb_irq_int_13 : 1; /* RW */ + unsigned long lb_irq_int_14 : 1; /* RW */ + unsigned long lb_irq_int_15 : 1; /* RW */ + unsigned long l1_nmi_int : 1; /* RW */ + unsigned long stop_clock : 1; /* RW */ + unsigned long asic_to_l1 : 1; /* RW */ + unsigned long l1_to_asic : 1; /* RW */ + unsigned long ltc_int : 1; /* RW */ + unsigned long la_seq_trigger : 1; /* RW */ + unsigned long rsvd_45_63 : 19; /* */ + } s; +}; + +/* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ #define UVH_NODE_ID 0x0UL @@ -284,6 +646,73 @@ union uvh_node_id_u { }; /* ========================================================================= */ +/* UVH_NODE_PRESENT_TABLE */ +/* ========================================================================= */ +#define UVH_NODE_PRESENT_TABLE 0x1400UL +#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + +union uvh_node_present_table_u { + unsigned long v; + struct uvh_node_present_table_s { + unsigned long nodes : 64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_0_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_1_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL + +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +union uvh_rh_gam_alias210_redirect_config_2_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23 : 24; /* */ + unsigned long dest_base : 22; /* RW */ + unsigned long rsvd_46_63: 18; /* */ + } s; +}; + +/* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL @@ -369,5 +798,77 @@ union uvh_si_addr_map_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL + +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias0_overlay_config_u { + unsigned long v; + struct uvh_si_alias0_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL + +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias1_overlay_config_u { + unsigned long v; + struct uvh_si_alias1_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ +/* ========================================================================= */ +#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL + +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 +#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL + +union uvh_si_alias2_overlay_config_u { + unsigned long v; + struct uvh_si_alias2_overlay_config_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + #endif /* __ASM_X86_UV_MMRS__ */ -- cgit v1.1 From 15ce60056b24a65b65e28de973a9fd8ac0750a2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 2 Jun 2008 13:20:11 +0200 Subject: xen: export get_phys_to_machine -tip testing found the following xen-console symbols trouble: ERROR: "get_phys_to_machine" [drivers/video/xen-fbfront.ko] undefined! ERROR: "get_phys_to_machine" [drivers/net/xen-netfront.ko] undefined! ERROR: "get_phys_to_machine" [drivers/input/xen-kbdfront.ko] undefined! with: http://redhat.com/~mingo/misc/config-Mon_Jun__2_12_25_13_CEST_2008.bad --- arch/x86/xen/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eef3b5c..17f374e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -136,6 +136,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) idx = p2m_index(pfn); return p2m_top[topidx][idx]; } +EXPORT_SYMBOL_GPL(get_phys_to_machine); static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) { -- cgit v1.1 From e2426cf85f8db5891fb5831323d2d0c176c4dadc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Sat, 31 May 2008 01:24:27 +0100 Subject: xen: avoid hypercalls when updating unpinned pud/pmd When operating on an unpinned pagetable (ie, one under construction or destruction), it isn't necessary to use a hypercall to update a pud/pmd entry. Jan Beulich observed that a similar optimisation avoided many thousands of hypercalls while doing a kernel build. One tricky part is that early in the kernel boot there's no page structure, so we can't check to see if the page is pinned. In that case, we just always use the hypercall. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/enlighten.c | 14 +++++++++++--- arch/x86/xen/mmu.c | 39 +++++++++++++++++++++++++++++++++++---- arch/x86/xen/mmu.h | 8 ++++---- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b94f63a..ed9f04b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -901,6 +901,14 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } +static __init void xen_post_allocator_init(void) +{ + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; + + xen_mark_init_mm_pinned(); +} + /* This is called once we have the cpu_possible_map */ void xen_setup_vcpu_info_placement(void) { @@ -988,7 +996,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { .banner = xen_banner, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, - .post_allocator_init = xen_mark_init_mm_pinned, + .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { @@ -1100,7 +1108,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte = NULL, /* see xen_pagetable_setup_* */ .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd, + .set_pmd = xen_set_pmd_hyper, .pte_val = xen_pte_val, .pte_flags = native_pte_val, @@ -1111,7 +1119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud, + .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 17f374e..4fa0934 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -223,7 +223,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -void xen_set_pmd(pmd_t *ptr, pmd_t val) +static bool page_pinned(void *ptr) +{ + struct page *page = virt_to_page(ptr); + + return PagePinned(page); +} + +void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) { struct multicall_space mcs; struct mmu_update *u; @@ -241,6 +248,18 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) preempt_enable(); } +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pmd_hyper(ptr, val); +} + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. @@ -348,7 +367,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) return ret; } -void xen_set_pud(pud_t *ptr, pud_t val) +void xen_set_pud_hyper(pud_t *ptr, pud_t val) { struct multicall_space mcs; struct mmu_update *u; @@ -366,6 +385,18 @@ void xen_set_pud(pud_t *ptr, pud_t val) preempt_enable(); } +void xen_set_pud(pud_t *ptr, pud_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pud_hyper(ptr, val); +} + void xen_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; @@ -387,7 +418,7 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) void xen_pmd_clear(pmd_t *pmdp) { - xen_set_pmd(pmdp, __pmd(0)); + set_pmd(pmdp, __pmd(0)); } pmd_t xen_make_pmd(pmdval_t pmd) @@ -758,7 +789,7 @@ void xen_exit_mmap(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (PagePinned(virt_to_page(mm->pgd))) + if (page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe961c..e3dd09e 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -25,10 +25,6 @@ enum pt_level { void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); @@ -45,10 +41,14 @@ pte_t xen_make_pte(pteval_t); pmd_t xen_make_pmd(pmdval_t); pgd_t xen_make_pgd(pgdval_t); +void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); +void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); +void xen_set_pud_hyper(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); -- cgit v1.1 From 9c7a794209f8a91f47697c3be20597eb60531e6d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Sat, 31 May 2008 01:33:02 +0100 Subject: xen: restore vcpu_info mapping If we're using vcpu_info mapping, then make sure its restored on all processors before relasing them from stop_machine. The only complication is that if this fails, we can't continue because we've already made assumptions that the mapping is available (baked in calls to the _direct versions of the functions, for example). Fortunately this can only happen with a 32-bit hypervisor, which may possibly run out of mapping space. On a 64-bit hypervisor, this is a non-issue. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/enlighten.c | 30 +++++++++++++++++++++++++++++- arch/x86/xen/suspend.c | 4 +++- arch/x86/xen/xen-ops.h | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ed9f04b..8e6152e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -98,7 +98,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; -static void __init xen_vcpu_setup(int cpu) +static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -136,6 +136,34 @@ static void __init xen_vcpu_setup(int cpu) } } +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + if (have_vcpu_info_placement) { + int cpu; + + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); + + xen_vcpu_setup(cpu); + + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); + } + + BUG_ON(!have_vcpu_info_placement); + } +} + static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7620a16..7ab10f6 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,8 @@ void xen_pre_suspend(void) void xen_post_suspend(int suspend_cancelled) { + xen_setup_shared_info(); + if (suspend_cancelled) { xen_start_info->store_mfn = pfn_to_mfn(xen_start_info->store_mfn); @@ -35,8 +37,8 @@ void xen_post_suspend(int suspend_cancelled) #ifdef CONFIG_SMP xen_cpu_initialized_map = cpu_online_map; #endif + xen_vcpu_restore(); } - xen_setup_shared_info(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a0503ac..a457e03 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -26,6 +26,7 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit v1.1 From d07af1f0e3a3e378074fc36322dd7b0e72d9a3e2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Sat, 31 May 2008 01:33:03 +0100 Subject: xen: resume timers on all vcpus On resume, the vcpu timer modes will not be restored. The timer infrastructure doesn't do this for us, since it assumes the cpus are offline. We can just poke the other vcpus into the right mode directly though. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/suspend.c | 1 + arch/x86/xen/time.c | 13 +++++++++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 15 insertions(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 7ab10f6..251669a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,6 +38,7 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); + xen_timer_resume(); } } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5..ea137fb 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,19 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_timer_resume(void) +{ + int cpu; + + if (xen_clockevent != &xen_vcpuop_clockevent) + return; + + for_each_online_cpu(cpu) { + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + } +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a457e03..9a05559 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -37,6 +37,7 @@ void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); +void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.1 From 7e0edc1bc343231029084761ebf59e522902eb49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Sat, 31 May 2008 01:33:04 +0100 Subject: xen: add new Xen elfnote types and use them appropriately Define recently added XEN_ELFNOTEs, and use them appropriately. Most significantly, this enables domain checkpointing (xm save -c). Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/xen-head.S | 5 +++++ include/xen/interface/elfnote.h | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 2ab5f42..ef6c9e0 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -7,6 +7,7 @@ #include <linux/init.h> #include <asm/boot.h> #include <xen/interface/elfnote.h> +#include <asm/xen/interface.h> __INIT ENTRY(startup_xen) @@ -32,5 +33,9 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, + .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) #endif /*CONFIG_XEN */ diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index a64d3df..7a8262c 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -120,6 +120,26 @@ */ #define XEN_ELFNOTE_BSD_SYMTAB 11 +/* + * The lowest address the hypervisor hole can begin at (numeric). + * + * This must not be set higher than HYPERVISOR_VIRT_START. Its presence + * also indicates to the hypervisor that the kernel can deal with the + * hole starting at a higher address. + */ +#define XEN_ELFNOTE_HV_START_LOW 12 + +/* + * List of maddr_t-sized mask/value pairs describing how to recognize + * (non-present) L1 page table entries carrying valid MFNs (numeric). + */ +#define XEN_ELFNOTE_L1_MFN_VALID 13 + +/* + * Whether or not the guest supports cooperative suspend cancellation. + */ +#define XEN_ELFNOTE_SUSPEND_CANCEL 14 + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* -- cgit v1.1 From 3d1ba1da2b4ff4ace7801e99fb9a3095b182d847 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 2 Jun 2008 13:50:10 +0200 Subject: x86: fix nmi.c build bug apic.h needs to be included for the apic_write_around() definition. --- arch/x86/kernel/nmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bf143f1..cbd4fa3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -11,6 +11,8 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ +#include <asm/apic.h> + #include <linux/nmi.h> #include <linux/mm.h> #include <linux/delay.h> -- cgit v1.1 From 4c1cbafb88490757a38119c41229251369bcecbc Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 09:28:52 +0200 Subject: x86 mpparse: build fix fix this build bug: drivers/acpi/pci_irq.c: In function 'acpi_pci_irq_enable': drivers/acpi/pci_irq.c:574: error: implicit declaration of function 'mp_config_acpi_gsi' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/acpi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h index 14411c9..73ce5b3 100644 --- a/include/asm-x86/acpi.h +++ b/include/asm-x86/acpi.h @@ -28,6 +28,7 @@ #include <asm/numa.h> #include <asm/processor.h> #include <asm/mmu.h> +#include <asm/mpspec.h> #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long -- cgit v1.1 From 2772f54bf37b033263abe4a2314c40a308a1a5cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 10:09:45 +0200 Subject: x86: add acpi_numa_slit_init() dummy implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed on 32-bit. --- arch/x86/mm/discontig_32.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 435c343..d28987a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -454,3 +454,12 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifdef CONFIG_ACPI_NUMA +/* + * Dummy on 32-bit, for now: + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} +#endif -- cgit v1.1 From b28852d6703e4b72ce363c5168ea8d3fb28b9c57 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 10:11:07 +0200 Subject: x86: add dummy acpi_numa_processor_affinity_init() implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed. --- arch/x86/mm/discontig_32.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index d28987a..bb8544a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -462,4 +462,9 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { } + +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ +} #endif -- cgit v1.1 From f3294690979634ee10398bb0beadfe1d4edb881d Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 10:16:10 +0200 Subject: x86, numaq: add pci_acpi_scan_root() stub allow 32-bit numaq build to succeed with ACPI enabled. --- arch/x86/kernel/numaq_32.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index e65281b..992f53c 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -87,3 +87,14 @@ static int __init numaq_tsc_disable(void) return 0; } arch_initcall(numaq_tsc_disable); + +#ifdef CONFIG_ACPI +/* + * Dummy implementation: + */ +struct pci_bus * __devinit +pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) +{ + return NULL; +} +#endif -- cgit v1.1 From cf3d0cb8a10a4fd19439bb34995d9ad28854739a Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 11:36:56 +0200 Subject: x86: 32-bit numa, build fix on Summit it's possible to have: CONFIG_ACPI_SRAT=y CONFIG_HAVE_ARCH_PARSE_SRAT=y in which case acpi.h defines the acpi_numa_slit_init() and acpi_numa_processor_affinity_init() methods as a macro. --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index bb8544a..0d28b4b 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -455,7 +455,7 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_ACPI_NUMA +#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) /* * Dummy on 32-bit, for now: */ -- cgit v1.1 From 471b3c1b011f807d16f1e19d1d4ecf703f1e7d1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 12:39:43 +0200 Subject: x86, numaq 32-bit: build fix fix: drivers/built-in.o: In function `acpi_numa_init': : undefined reference to `acpi_numa_arch_fixup' which can happen with ACPI && NUMAQ. --- arch/x86/mm/discontig_32.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 0d28b4b..8b4eac0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -467,4 +467,8 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } + +void __init acpi_numa_arch_fixup(void) +{ +} #endif -- cgit v1.1 From ba924c81dd5a7a7fb5ded025af7fdd3b61f8ca67 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 31 May 2008 22:51:51 -0700 Subject: x86, numa, 32-bit: increase max_elements to 1024 so every element will represent 64M instead of 256M. AMD opteron could have HW memory hole remapping, so could have [0, 8g + 64M) on node0. Reduce element size to 64M to keep that on node 0 Later we need to use find_e820_area() to allocate memory_node_map like on 64-bit. But need to move memory_present out of populate_mem_map... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 14 +++++++------- include/asm-x86/mmzone_32.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 55fdbab..922af20 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; /* * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 256Mb break (each element of the array will - * represent 256Mb of memory and will be marked by the node id. so, + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, * if the first gig is on node 0, and the second gig is on node 1 * physnode_map will contain: * - * physnode_map[0-3] = 0; - * physnode_map[4-7] = 1; - * physnode_map[8- ] = -1; + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; */ s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; EXPORT_SYMBOL(physnode_map); @@ -81,9 +81,9 @@ void memory_present(int nid, unsigned long start, unsigned long end) printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk("%ld ", pfn); + printk(KERN_CONT "%ld ", pfn); } - printk("\n"); + printk(KERN_CONT "\n"); } unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h index faef751..ab00128 100644 --- a/include/asm-x86/mmzone_32.h +++ b/include/asm-x86/mmzone_32.h @@ -51,14 +51,14 @@ extern int early_pfn_to_nid(unsigned long pfn); /* * generic node memory support, the following assumptions apply: * - * 1) memory comes in 256Mb contigious chunks which are either present or not + * 1) memory comes in 64Mb contigious chunks which are either present or not * 2) we will not have more than 64Gb in total * * for now assume that 64Gb is max amount of RAM for whole system * 64Gb / 4096bytes/page = 16777216 pages */ #define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 +#define MAX_ELEMENTS 1024 #define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) extern s8 physnode_map[]; -- cgit v1.1 From b66cd7207387b9b428aaf1988e21dd263c6a4928 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 31 May 2008 22:53:47 -0700 Subject: x86: set node_remap_size[0] in fallback path ... otherwise alloc_remap will not get node_mem_map from kva area, and alloc_node_mem_map has to alloc_bootmem_node to get mem_map. It will use two low address copies ... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 922af20..98b099e 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -124,6 +124,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); + node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); -- cgit v1.1 From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 31 May 2008 22:52:47 -0700 Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn() we don't need to call memory_present that early. numa and sparse will call memory_present later and might even fail, it will call memory_present for the full range. also for sparse it will call alloc_bootmem ... before we set up bootmem. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_32.c | 5 ++--- arch/x86/kernel/setup_32.c | 4 ++-- arch/x86/mm/discontig_32.c | 2 +- include/asm-x86/e820_32.h | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index db760d4..0c025d0 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -210,7 +210,7 @@ void __init init_iomem_resources(struct resource *code_resource, /* * Find the highest page frame number we have available */ -void __init propagate_e820_map(void) +void __init find_max_pfn(void) { int i; @@ -227,7 +227,6 @@ void __init propagate_e820_map(void) continue; if (end > max_pfn) max_pfn = end; - memory_present(0, start, end); } } @@ -361,7 +360,7 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - propagate_e820_map(); + find_max_pfn(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 6f40cb5..2901042 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -730,10 +730,10 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - propagate_e820_map(); + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - propagate_e820_map(); + find_max_pfn(); max_low_pfn = setup_memory(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 98b099e..ebbbba3 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. */ - propagate_e820_map(); + find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 7ace825..00fbc60 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,7 +21,7 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern void propagate_e820_map(void); +extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, -- cgit v1.1 From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 1 Jun 2008 13:15:22 -0700 Subject: x86, numa, 32-bit: print out debug info on all kvas also fix the print out of node_remap_end_vaddr Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 9 +++++++-- mm/page_alloc.c | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index ebbbba3..3150ad3 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -168,6 +168,8 @@ static void __init allocate_pgdat(int nid) reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), "NODE_DATA"); } + printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", + nid, (unsigned long)NODE_DATA(nid)); } #ifdef CONFIG_DISCONTIGMEM @@ -208,8 +210,12 @@ void __init remap_numa_kva(void) int node; for_each_online_node(node) { + printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); + printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", + (unsigned long)vaddr, + node_remap_start_pfn[node] + pfn); set_pmd_pfn((ulong) vaddr, node_remap_start_pfn[node] + pfn, PAGE_KERNEL_LARGE); @@ -293,8 +299,7 @@ static void init_remap_allocator(int nid) printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); + (ulong) node_remap_end_vaddr[nid]); } #else void *alloc_remap(int nid, unsigned long size) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6383557..502223c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3486,6 +3486,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", + nid, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#endif free_area_init_core(pgdat, zones_size, zholes_size); } -- cgit v1.1 From 2944e16b25e7fb8b5ee0dd9dc7197a0f9e523cfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 1 Jun 2008 13:17:38 -0700 Subject: x86: update mptable make mptable to be consistent with acpi routing, so we could: 1. kexec kernel with acpi=off 2. work around BIOSes where acpi routing is working, but mptable is not right, so can use kernel/kexec to start other OSes that don't have good acpi support. command line: update_mptable Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 22 +++ arch/x86/kernel/e820.c | 25 +++ arch/x86/kernel/mpparse.c | 401 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/setup_64.c | 4 + drivers/acpi/pci_irq.c | 5 + include/asm-x86/e820.h | 1 + include/asm-x86/mpspec.h | 4 + 7 files changed, 445 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2ddfaba..f226bdc 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1154,6 +1154,28 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } +int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity) +{ + struct mpc_config_intsrc intsrc; + int ioapic; + + /* print the entry should happen on mptable identically */ + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + intsrc.mpc_srcbus = number; + intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id; + intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; + + MP_intsrc_info(&intsrc); + + return 0; +} + /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0cd9132..cd2b99e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -739,3 +739,28 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) return -1UL; } + +/* + * pre allocated 4k and reserved it in e820 + */ +u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +{ + u64 size = 0; + u64 addr; + u64 start; + + start = startt; + while (size < sizet) + start = find_e820_area_size(start, &size, align); + + if (size < sizet) + return 0; + + addr = round_down(start + size - sizet, align); + update_memory_range(addr, sizet, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820 for early_reserve_e820\n"); + update_e820(); + + return addr; +} + diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9f3792d..8898aa4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -25,6 +25,8 @@ #include <asm/proto.h> #include <asm/acpi.h> #include <asm/bios_ebda.h> +#include <asm/e820.h> +#include <asm/trampoline.h> #include <mach_apic.h> #ifdef CONFIG_X86_32 @@ -161,20 +163,81 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) nr_ioapics++; } -static void __init MP_intsrc_info(struct mpc_config_intsrc *m) +static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_INFO "Int: type %d, pol %d, trig %d, bus %02x," + printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - mp_irqs[mp_irq_entries].mp_dstapic = m->mpc_dstapic; - mp_irqs[mp_irq_entries].mp_type = m->mpc_type; - mp_irqs[mp_irq_entries].mp_irqtype = m->mpc_irqtype; - mp_irqs[mp_irq_entries].mp_irqflag = m->mpc_irqflag; - mp_irqs[mp_irq_entries].mp_srcbus = m->mpc_srcbus; - mp_irqs[mp_irq_entries].mp_srcbusirq = m->mpc_srcbusirq; - mp_irqs[mp_irq_entries].mp_dstirq = m->mpc_dstirq; +} + +static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) +{ + printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, + (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, + mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); +} + +static void assign_to_mp_irq(struct mpc_config_intsrc *m, + struct mp_config_intsrc *mp_irq) +{ + mp_irq->mp_dstapic = m->mpc_dstapic; + mp_irq->mp_type = m->mpc_type; + mp_irq->mp_irqtype = m->mpc_irqtype; + mp_irq->mp_irqflag = m->mpc_irqflag; + mp_irq->mp_srcbus = m->mpc_srcbus; + mp_irq->mp_srcbusirq = m->mpc_srcbusirq; + mp_irq->mp_dstirq = m->mpc_dstirq; +} + +static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, + struct mpc_config_intsrc *m) +{ + m->mpc_dstapic = mp_irq->mp_dstapic; + m->mpc_type = mp_irq->mp_type; + m->mpc_irqtype = mp_irq->mp_irqtype; + m->mpc_irqflag = mp_irq->mp_irqflag; + m->mpc_srcbus = mp_irq->mp_srcbus; + m->mpc_srcbusirq = mp_irq->mp_srcbusirq; + m->mpc_dstirq = mp_irq->mp_dstirq; +} + +static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, + struct mpc_config_intsrc *m) +{ + if (mp_irq->mp_dstapic != m->mpc_dstapic) + return 1; + if (mp_irq->mp_type != m->mpc_type) + return 2; + if (mp_irq->mp_irqtype != m->mpc_irqtype) + return 3; + if (mp_irq->mp_irqflag != m->mpc_irqflag) + return 4; + if (mp_irq->mp_srcbus != m->mpc_srcbus) + return 5; + if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq) + return 6; + if (mp_irq->mp_dstirq != m->mpc_dstirq) + return 7; + + return 0; +} + +void MP_intsrc_info(struct mpc_config_intsrc *m) +{ + int i; + + print_MP_intsrc_info(m); + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } @@ -268,12 +331,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, * Read/parse the MPC */ -static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem, + char *str) { - char str[16]; - char oem[10]; - int count = sizeof(*mpc); - unsigned char *mpt = ((unsigned char *)mpc) + count; if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", @@ -301,13 +361,28 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) memcpy(str, mpc->mpc_productid, 12); str[12] = 0; -#ifdef CONFIG_X86_32 - mps_oem_check(mpc, oem, str); -#endif printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); + return 1; +} + +static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifdef CONFIG_X86_32 + mps_oem_check(mpc, oem, str); +#endif + /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -785,3 +860,295 @@ void __init find_smp_config(void) { __find_smp_config(1); } + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m) +{ + int i; + + if (m->mpc_irqtype != mp_INT) + return 0; + + if (m->mpc_irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].mp_irqtype != mp_INT) + continue; + + if (mp_irqs[i].mp_irqflag != 0x0f) + continue; + + if (mp_irqs[i].mp_srcbus != m->mpc_srcbus) + continue; + if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; +#endif + +static int __init replace_intsrc_all(struct mp_config_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; + int nr_m_spare = 0; +#endif + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length); + while (count < mpc->mpc_length) { + switch (*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m = + (struct mpc_config_processor *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m = + (struct mpc_config_bus *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + mpt += sizeof(struct mpc_config_ioapic); + count += sizeof(struct mpc_config_ioapic); + break; + } + case MP_INTSRC: + { +#ifdef CONFIG_X86_IO_APIC + struct mpc_config_intsrc *m = + (struct mpc_config_intsrc *)mpt; + + printk(KERN_INFO "OLD "); + print_MP_intsrc_info(m); + i = get_MP_intsrc_index(m); + if (i > 0) { + assign_to_mpc_intsrc(&mp_irqs[i], m); + printk(KERN_INFO "NEW "); + print_mp_irq_info(&mp_irqs[i]); + } else if (!i) { + /* legacy, do nothing */ + } else if (nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) + * are invalid entries, + * we need to use the slot later + */ + m_spare[nr_m_spare] = m; + nr_m_spare++; + } +#endif + mpt += sizeof(struct mpc_config_intsrc); + count += sizeof(struct mpc_config_intsrc); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m = + (struct mpc_config_lintsrc *)mpt; + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + default: + /* wrong mptable */ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); + printk(KERN_ERR "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->mpc_length, 1); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].mp_irqtype != mp_INT) + continue; + + if (mp_irqs[i].mp_irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + printk(KERN_INFO "*NEW* found "); + nr_m_spare--; + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_config_intsrc *m = + (struct mpc_config_intsrc *)mpt; + count += sizeof(struct mpc_config_intsrc); + if (!mpc_new_phys) { + printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); + } else { + if (count <= mpc_new_length) + printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); + else { + printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); + goto out; + } + } + assign_to_mpc_intsrc(&mp_irqs[i], m); + mpc->mpc_length = count; + mpt += sizeof(struct mpc_config_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->mpc_checksum = 0; + mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc, + mpc->mpc_length); + + return 0; +} + +int __initdata enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) { + u64 startt = 0; +#ifdef CONFIG_X86_TRAMPOLINE + startt = TRAMPOLINE_BASE; +#endif + mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); + } +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct intel_mp_floating *mpf; + struct mp_config_table *mpc; + struct mp_config_table *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->mpf_feature1 != 0) + return 0; + + if (!mpf->mpf_physptr) + return 0; + + mpc = phys_to_virt(mpf->mpf_physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); + printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); + + if (mpc_new_phys && mpc->mpc_length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the postion */ + mpc->mpc_checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + mpc->mpc_checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-positon replacing\n"); + } else { + mpf->mpf_physptr = mpc_new_phys; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->mpc_length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_phys - mpf->mpf_physptr) { + struct intel_mp_floating *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = phys_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->mpf_physptr = mpc_new_phys; + } + mpf->mpf_checksum = 0; + mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 89e6cca..978a0d6 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -56,6 +56,7 @@ #include <asm/desc.h> #include <video/edid.h> #include <asm/e820.h> +#include <asm/mpspec.h> #include <asm/dma.h> #include <asm/gart.h> #include <asm/mpspec.h> @@ -381,6 +382,9 @@ void __init setup_arch(char **cmdline_p) * we are rounding upwards: */ end_pfn = e820_end_of_ram(); + + /* pre allocte 4k for mptable mpc */ + early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(end_pfn)) { diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 89022a7..e556f30 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -570,6 +570,11 @@ int acpi_pci_irq_enable(struct pci_dev *dev) (triggering == ACPI_LEVEL_SENSITIVE) ? "level" : "edge", (polarity == ACPI_ACTIVE_LOW) ? "low" : "high", dev->irq); +#ifdef CONFIG_X86 + mp_config_acpi_gsi(dev->bus->number, dev->devfn, dev->pin, irq, + triggering, polarity); +#endif + return 0; } diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 4266a2c..ee8fe4c 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -84,6 +84,7 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); +extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index b785ddd..6a34d6d 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -39,6 +39,7 @@ extern unsigned long mp_lapic_addr; extern void find_smp_config(void); extern void get_smp_config(void); +extern void early_reserve_e820_mpc_new(void); void __cpuinit generic_processor_info(int apicid, int version); #ifdef CONFIG_ACPI @@ -47,6 +48,9 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +extern void MP_intsrc_info(struct mpc_config_intsrc *m); +extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity); #endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) -- cgit v1.1 From 287572cb38de7f270b59191a0fecfa5c5de7765d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 1 Jun 2008 21:06:31 -0700 Subject: x86, numa, 32-bit: avoid clash between ramdisk and kva use find_e820_area to get address space... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3150ad3..73a9834 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -38,6 +38,7 @@ #include <asm/setup.h> #include <asm/mmzone.h> #include <asm/bios_ebda.h> +#include <asm/proto.h> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -326,7 +327,6 @@ unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; - unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -337,29 +337,18 @@ unsigned long __init setup_memory(void) */ get_memcfg_numa(); - kva_pages = calculate_numa_remap_pages(); + kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_start_pfn = find_max_low_pfn() - kva_pages; - -#ifdef CONFIG_BLK_DEV_INITRD - /* Numa kva area is below the initrd */ - if (initrd_start) - kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - - kva_pages; -#endif - - /* - * We waste pages past at the end of the KVA for no good reason other - * than how it is located. This is bad. - */ - wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); - kva_start_pfn -= wasted_pages; - kva_pages += wasted_pages; - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + kva_start_pfn = find_e820_area(kva_start_pfn<<PAGE_SHIFT, + max_low_pfn<<PAGE_SHIFT, + kva_pages<<PAGE_SHIFT, + PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; + printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); -- cgit v1.1 From 6af61a7614a306fe882a0c2b4ddc63b65aa66efc Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 1 Jun 2008 23:53:50 -0700 Subject: x86: clean up max_pfn_mapped usage - 32-bit on 32-bit in head_32.S after initial page table is done, we get initial max_pfn_mapped, and then kernel_physical_mapping_init will give us a final one. We need to use that to make sure find_e820_area will get valid addresses for boot_map and for NODE_DATA(0) on numa32. XEN PV and lguest may need to assign max_pfn_mapped too. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_32.S | 4 ++++ arch/x86/kernel/setup_32.c | 4 +++- arch/x86/mm/discontig_32.c | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bef4618..ac7002f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -220,6 +220,8 @@ default_entry: jb 10b 1: movl %edi,pa(init_pg_tables_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax @@ -251,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2901042..c985a8c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -586,7 +586,7 @@ void __init setup_bootmem_allocator(void) */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, bootmap_size, + max_pfn_mapped<<PAGE_SHIFT, bootmap_size, PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); @@ -595,6 +595,8 @@ void __init setup_bootmem_allocator(void) reserve_initrd(); #endif bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: %08lx - %08lx\n", min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); printk(KERN_INFO " bootmap %08lx - %08lx\n", diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 73a9834..914a81e 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -163,7 +163,8 @@ static void __init allocate_pgdat(int nid) else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t), + (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT, + sizeof(pg_data_t), PAGE_SIZE); NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), -- cgit v1.1 From c8c034ce79418d2143c00c4cf751cfa51701f788 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 1 Jun 2008 23:55:37 -0700 Subject: x86: clean up max_pfn_mapped usage - 64-bit on 64-bit we only get valid max_pfn_mapped after init_memory_mapping(). Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_64.c | 15 +++------------ arch/x86/kernel/setup_64.c | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 5e063e7..a11bec2 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -55,16 +55,12 @@ unsigned long __init e820_end_of_ram(void) last_pfn = find_max_pfn_with_active_regions(); - if (last_pfn > max_pfn_mapped) - max_pfn_mapped = last_pfn; - if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) - max_pfn_mapped = MAXMEM>>PAGE_SHIFT; + if (last_pfn > MAXMEM>>PAGE_SHIFT) + last_pfn = MAXMEM>>PAGE_SHIFT; if (last_pfn > end_user_pfn) last_pfn = end_user_pfn; - if (last_pfn > max_pfn_mapped) - last_pfn = max_pfn_mapped; - printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped); + printk(KERN_INFO "last_pfn = %lu\n", last_pfn); return last_pfn; } @@ -109,10 +105,6 @@ static int __init e820_find_active_region(const struct e820entry *ei, if (*ei_startpfn >= *ei_endpfn) return 0; - /* Check if max_pfn_mapped should be updated */ - if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped) - max_pfn_mapped = *ei_endpfn; - /* Skip if map is outside the node */ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) @@ -229,7 +221,6 @@ static int __init parse_memmap_opt(char *p) saved_max_pfn = e820_end_of_ram(); remove_all_active_ranges(); #endif - max_pfn_mapped = 0; e820.nr_map = 0; userdef = 1; return 0; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 978a0d6..2599b27 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -396,7 +396,7 @@ void __init setup_arch(char **cmdline_p) check_efer(); - max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT)); + max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT)); if (efi_enabled) efi_init(); -- cgit v1.1 From 835fc943f34139ed062f1ac194b52ed3b7123d88 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 3 Jun 2008 14:42:06 +0200 Subject: x86: mp build fix fix: drivers/built-in.o: In function `acpi_pci_irq_enable': : undefined reference to `mp_config_acpi_gsi' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mpspec.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 6a34d6d..dbbf363 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -49,8 +49,17 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); extern void MP_intsrc_info(struct mpc_config_intsrc *m); +#ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); +#else +static inline int +mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity) +{ + return 0; +} +#endif #endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) -- cgit v1.1 From d44b9d17faf7bca165ce73a1acb936b65a3f0cc6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Date: Tue, 3 Jun 2008 13:06:07 -0700 Subject: x86: move bugs_64.c to cpu/bugs_64.c It looks good to move bugs_64.c to cpu/bugs_64.c. Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/bugs_64.c | 33 --------------------------------- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/bugs_64.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 34 deletions(-) delete mode 100644 arch/x86/kernel/bugs_64.c create mode 100644 arch/x86/kernel/cpu/bugs_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3..4934fec 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o obj-y += bootflag.o e820_$(BITS).o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-$(CONFIG_X86_64) += bugs_64.o obj-y += tsc_$(BITS).o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c deleted file mode 100644 index 9a3ed06..0000000 --- a/arch/x86/kernel/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> -#include <asm/cacheflush.h> - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c77a1c5..65b1be5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -6,6 +6,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o +obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c new file mode 100644 index 0000000..9a3ed06 --- /dev/null +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -0,0 +1,33 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/alternative.h> +#include <asm/bugs.h> +#include <asm/processor.h> +#include <asm/mtrr.h> +#include <asm/cacheflush.h> + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +} -- cgit v1.1 From f19dc2f22a180dde3f9e611b76c73f5390c11ecd Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 10:24:49 -0700 Subject: x86: change propagate_e820_map() back to find_max_pfn(), 32-bit, fix add memory_present() calls for sparse and non-numa. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index c985a8c..ccf3595 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -377,11 +377,13 @@ static unsigned long __init setup_memory(void) if (max_pfn > max_low_pfn) { highstart_pfn = max_low_pfn; } + memory_present(0, 0, highend_pfn); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else + memory_present(0, 0, max_low_pfn); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif -- cgit v1.1 From ab530e1f781da4d704892daab2bdd568f473687d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 10:25:54 -0700 Subject: x86: early check if a system is numaq so we could fall back to one node numa. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mpparse.c | 14 +++++++++++--- arch/x86/kernel/numaq_32.c | 24 +++++++++++++++++++----- include/asm-x86/mpspec.h | 4 ++-- include/asm-x86/numaq.h | 1 + 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8898aa4..b4a950d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -70,7 +70,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) return; } #ifdef CONFIG_X86_NUMAQ - apicid = mpc_apic_id(m, translation_table[mpc_record]); + if (found_numaq) + apicid = mpc_apic_id(m, translation_table[mpc_record]); + else + apicid = m->mpc_apicid; #else apicid = m->mpc_apicid; #endif @@ -91,7 +94,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m) str[6] = 0; #ifdef CONFIG_X86_NUMAQ - mpc_oem_bus_info(m, str, translation_table[mpc_record]); + if (found_numaq) + mpc_oem_bus_info(m, str, translation_table[mpc_record]); #else printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); #endif @@ -112,7 +116,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m) #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { #ifdef CONFIG_X86_NUMAQ - mpc_oem_pci_bus(m, translation_table[mpc_record]); + if (found_numaq) + mpc_oem_pci_bus(m, translation_table[mpc_record]); #endif clear_bit(m->mpc_busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined (CONFIG_MCA) @@ -321,6 +326,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, { if (strncmp(oem, "IBM NUMA", 8)) printk("Warning! May not be a NUMA-Q system!\n"); + else + found_numaq = 1; + if (mpc->mpc_oemptr) smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, mpc->mpc_oemsize); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index e65281b..922be66 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,9 +31,12 @@ #include <asm/numaq.h> #include <asm/topology.h> #include <asm/processor.h> +#include <asm/mpspec.h> #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) +int found_numaq; + /* * Function: smp_dump_qct() * @@ -67,13 +70,24 @@ static void __init smp_dump_qct(void) } } -/* - * Unlike Summit, we don't really care to let the NUMA-Q - * fall back to flat mode. Don't compile for NUMA-Q - * unless you really need it! - */ +static __init void early_check_numaq(void) +{ + /* + * Find possible boot-time SMP configuration: + */ + early_find_smp_config(); + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +} + int __init get_memcfg_numaq(void) { + early_check_numaq(); + if (!found_numaq) + return 0; smp_dump_qct(); return 1; } diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index dbbf363..f38b68e 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -21,11 +21,11 @@ extern int pic_mode; /* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */ #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) +#endif + extern void early_find_smp_config(void); extern void early_get_smp_config(void); -#endif - #if defined(CONFIG_MCA) || defined(CONFIG_EISA) extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h index 94b86c3..739d164 100644 --- a/include/asm-x86/numaq.h +++ b/include/asm-x86/numaq.h @@ -28,6 +28,7 @@ #ifdef CONFIG_X86_NUMAQ +extern int found_numaq; extern int get_memcfg_numaq(void); /* -- cgit v1.1 From 84b56fa46b36c2df508e7d421feab514fad30f81 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 19:32:30 -0700 Subject: x86, numa, 32-bit: make sure get we kva space when 1/3 user/kernel split is used, and less memory is installed, or if we have a big hole below 4g, max_low_pfn is still using 3g-128m try to go down from max_low_pfn until we get it. otherwise will panic. need to make 32-bit code to use register_e820_active_regions ... later. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 914a81e..7ced26a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -328,6 +328,7 @@ unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; + long kva_target_pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -344,11 +345,17 @@ unsigned long __init setup_memory(void) system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - kva_start_pfn = find_e820_area(kva_start_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, - kva_pages<<PAGE_SHIFT, - PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; + kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + do { + kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, + max_low_pfn<<PAGE_SHIFT, + kva_pages<<PAGE_SHIFT, + PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; + kva_target_pfn -= PTRS_PER_PTE; + } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + + if (kva_start_pfn == -1UL) + panic("Can not get kva space\n"); printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); -- cgit v1.1 From ee0c80fadfa56bf4f9d90c1c023429a6bd8edd69 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 19:34:00 -0700 Subject: x86: move e820_register_active() to e820.c to prepare 32-bit to use it. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_64.c | 97 ----------------------------------------- include/asm-x86/e820.h | 11 +++++ include/asm-x86/e820_64.h | 5 --- 4 files changed, 120 insertions(+), 102 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cd2b99e..c140f73 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -764,3 +764,112 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) return addr; } +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM<<PAGE_SHIFT +#endif + +/* + * Last pfn which the user wants to use. + */ +unsigned long __initdata end_user_pfn = MAX_ARCH_PFN; + +/* + * Find the highest page frame number we have available + */ +unsigned long __init e820_end_of_ram(void) +{ + unsigned long last_pfn; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + last_pfn = find_max_pfn_with_active_regions(); + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + if (last_pfn > end_user_pfn) + last_pfn = end_user_pfn; + + printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n", + last_pfn, max_arch_pfn); + return last_pfn; +} + +/* + * Finds an active region in the address range from start_pfn to last_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. + */ +int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) +{ + u64 align = PAGE_SIZE; + + *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= last_pfn) + return 0; + + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; + + /* Obey end_user_pfn to save on memmap */ + if (*ei_startpfn >= end_user_pfn) + return 0; + if (*ei_endpfn > end_user_pfn) + *ei_endpfn = end_user_pfn; + + return 1; +} + +/* Walk the e820 map and register active regions within a node */ +void __init e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); +} + +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +u64 __init e820_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn, ei_endpfn, ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - ((u64)ram << PAGE_SHIFT); +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index a11bec2..0afee2c 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -42,29 +42,6 @@ unsigned long end_pfn; unsigned long max_pfn_mapped; /* - * Last pfn which the user wants to use. - */ -static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; - -/* - * Find the highest page frame number we have available - */ -unsigned long __init e820_end_of_ram(void) -{ - unsigned long last_pfn; - - last_pfn = find_max_pfn_with_active_regions(); - - if (last_pfn > MAXMEM>>PAGE_SHIFT) - last_pfn = MAXMEM>>PAGE_SHIFT; - if (last_pfn > end_user_pfn) - last_pfn = end_user_pfn; - - printk(KERN_INFO "last_pfn = %lu\n", last_pfn); - return last_pfn; -} - -/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) @@ -88,80 +65,6 @@ void __init e820_reserve_resources(void) } } -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the e820 entry. - */ -static int __init e820_find_active_region(const struct e820entry *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= last_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > last_pfn) - *ei_endpfn = last_pfn; - - /* Obey end_user_pfn to save on memmap */ - if (*ei_startpfn >= end_user_pfn) - return 0; - if (*ei_endpfn > end_user_pfn) - *ei_endpfn = end_user_pfn; - - return 1; -} - -/* Walk the e820 map and register active regions within a node */ -void __init -e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned long ei_startpfn; - unsigned long ei_endpfn; - int i; - - for (i = 0; i < e820.nr_map; i++) - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); -} - -/* - * Find the hole size (in bytes) in the memory range. - * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -unsigned long __init e820_hole_size(unsigned long start, unsigned long end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long last_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn, ei_endpfn, ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - } - return end - start - (ram << PAGE_SHIFT); -} - static void early_panic(char *msg) { early_printk(msg); diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index ee8fe4c..44ed9c0 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -79,6 +79,8 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn) } #endif +extern unsigned long end_user_pfn; + extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); @@ -86,6 +88,15 @@ extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +extern unsigned long e820_end_of_ram(void); +extern int e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn); +extern void e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); +extern u64 e820_hole_size(u64 start, u64 end); #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 917963c..368585d 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -16,16 +16,11 @@ #ifndef __ASSEMBLY__ extern void setup_memory_region(void); extern void contig_e820_setup(void); -extern unsigned long e820_end_of_ram(void); extern void e820_reserve_resources(void); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); extern int is_memory_all_valid(unsigned long start, unsigned long end); -extern unsigned long e820_hole_size(unsigned long start, unsigned long end); - -extern void e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern void finish_e820_parsing(void); -- cgit v1.1 From 7b2a0a6c4866cac146dcb0433e6984eb19a81335 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 19:35:04 -0700 Subject: x86: make 32-bit use e820_register_active_regions() this way 32-bit is more similar to 64-bit, and smarter e820 and numa. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_32.c | 68 ++-------------------------------------------- arch/x86/kernel/numaq_32.c | 3 ++ arch/x86/kernel/setup_32.c | 24 ++++++++++++---- arch/x86/kernel/srat_32.c | 4 ++- arch/x86/mm/discontig_32.c | 19 ++++--------- include/asm-x86/e820_32.h | 2 -- 6 files changed, 33 insertions(+), 87 deletions(-) diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 0c025d0..e8a3b96 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -207,69 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource, } } -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? */ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - } -} - -/* - * Register fully available low RAM pages with the bootmem allocator. - */ -void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; @@ -360,8 +297,9 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - find_max_pfn(); - saved_max_pfn = max_pfn; + e820_register_active_regions(0, 0, -1UL); + saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); #endif e820.nr_map = 0; user_defined_memmap = 1; diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 922be66..27b9082 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -32,6 +32,7 @@ #include <asm/topology.h> #include <asm/processor.h> #include <asm/mpspec.h> +#include <asm/e820.h> #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -61,6 +62,8 @@ static void __init smp_dump_qct(void) node_end_pfn[node] = MB_TO_PAGES( eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); + e820_register_active_regions(node, node_start_pfn[node], + node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); node_remap_size[node] = node_memmap_size_bytes(node, diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index ccf3595..0ec6480 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -405,11 +405,12 @@ static void __init zone_sizes_init(void) max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + remove_all_active_ranges(); #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - add_active_range(0, 0, highend_pfn); + e820_register_active_regions(0, 0, highend_pfn); #else - add_active_range(0, 0, max_low_pfn); + e820_register_active_regions(0, 0, max_low_pfn); #endif free_area_init_nodes(max_zone_pfns); @@ -582,6 +583,7 @@ static void __init relocate_initrd(void) void __init setup_bootmem_allocator(void) { + int i; unsigned long bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): @@ -603,7 +605,8 @@ void __init setup_bootmem_allocator(void) min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); printk(KERN_INFO " bootmap %08lx - %08lx\n", bootmap, bootmap + bootmap_size); - register_bootmem_low_pages(max_low_pfn); + for_each_online_node(i) + free_bootmem_with_active_regions(i, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); #ifdef CONFIG_ACPI_SLEEP @@ -733,11 +736,20 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); + e820_register_active_regions(0, 0, -1UL); + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + max_pfn = e820_end_of_ram(); + /* update e820 for memory not covered by WB MTRRs */ - find_max_pfn(); mtrr_bp_init(); - if (mtrr_trim_uncached_memory(max_pfn)) - find_max_pfn(); + if (mtrr_trim_uncached_memory(max_pfn)) { + remove_all_active_ranges(); + e820_register_active_regions(0, 0, -1UL); + max_pfn = e820_end_of_ram(); + } max_low_pfn = setup_memory(); diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 88971ee..32d8b11 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -31,6 +31,7 @@ #include <asm/srat.h> #include <asm/topology.h> #include <asm/smp.h> +#include <asm/e820.h> /* * proximity macros and definitions @@ -244,7 +245,8 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); - add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); + e820_register_active_regions(chunk->nid, chunk->start_pfn, + min(chunk->end_pfn, max_pfn)); } for_each_online_node(nid) { diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7ced26a..a89ccf3 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,10 +120,9 @@ int __init get_memcfg_numa_flat(void) { printk("NUMA - single node, flat memory mode\n"); - /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; + e820_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); @@ -337,6 +336,11 @@ unsigned long __init setup_memory(void) * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. */ + + /* call find_max_low_pfn at first, it could update max_pfn */ + system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + + remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -344,7 +348,6 @@ unsigned long __init setup_memory(void) /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, @@ -402,7 +405,6 @@ unsigned long __init setup_memory(void) void __init zone_sizes_init(void) { - int nid; unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = @@ -412,15 +414,6 @@ void __init zone_sizes_init(void) max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; #endif - /* If SRAT has not registered memory, register it now */ - if (find_max_pfn_with_active_regions() == 0) { - for_each_online_node(nid) { - if (node_has_online_mem(nid)) - add_active_range(nid, node_start_pfn[nid], - node_end_pfn[nid]); - } - } - free_area_init_nodes(max_zone_pfns); return; } diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 00fbc60..212b74c 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,8 +21,6 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern void find_max_pfn(void); -extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, -- cgit v1.1 From bd70e522afce2f7837d081dc52f261ecf9d4d2d5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 4 Jun 2008 13:21:29 -0700 Subject: x86: e820 max_arch_pfn typo fix for 64 bit should use right shift Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c140f73..5d33b9c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -771,7 +771,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) # define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) # endif #else /* CONFIG_X86_32 */ -# define MAX_ARCH_PFN MAXMEM<<PAGE_SHIFT +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT #endif /* -- cgit v1.1 From fa5b8a30cf03520737e9a0ee2ee03a61b2eccf05 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Mon, 26 May 2008 20:40:47 +0200 Subject: aperture_64.c: duplicated code, buggy? Hi! void __init early_gart_iommu_check(void) contains for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; loop, with very similar loop duplicated in void __init gart_iommu_hole_init(void) . First copy of a loop seems to be buggy, too. It uses 0 as a "nothing set" value, which may actually bite us in last_aper_enabled case (because it may be often zero). (Beware, it is hard to test this patch, because this code has about 2^8 different code paths, depending on hardware and cmdline settings). Plus, the second loop does not check for consistency of aper_enabled. Should it? Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6eea42e..e5b17f9 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -271,16 +271,16 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - int fix, slot; + int i, fix, slot; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; - int aper_enabled = 0, last_aper_enabled = 0; - int i; + int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; if (!early_pci_allowed()) return; + /* This is mostly duplicate of iommu_hole_init */ fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; @@ -301,19 +301,22 @@ void __init early_gart_iommu_check(void) aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base) || - (last_aper_enabled && aper_enabled != last_aper_enabled)) { - fix = 1; - goto out; + if (last_valid) { + if ((aper_order != last_aper_order) || + (aper_base != last_aper_base) || + (aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } } + last_aper_order = aper_order; last_aper_base = aper_base; last_aper_enabled = aper_enabled; + last_valid = 1; } } -out: if (!fix && !aper_enabled) return; -- cgit v1.1 From 4f384f8bcdb5d618a0a68fb84c809e602c798b8f Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Mon, 26 May 2008 21:17:30 +0200 Subject: x86: aperture_64.c: corner case wrong If fix == 0, aper_enabled == 1, gart_fix_e820 == 0 if (!fix && !aper_enabled) return; if (gart_fix_e820 && !fix && aper_enabled) { if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); update_e820(); } return; } /* different nodes have different setting, disable them all atfirst*/ we'll fall back here and disable all the settings, even when they were all consistent. What about this? (I hope it compiles...) Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: Dave Jones <davej@codemonkey.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/aperture_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index e5b17f9..eb20f16 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -331,9 +331,11 @@ void __init early_gart_iommu_check(void) add_memory_region(aper_base, aper_size, E820_RESERVED); update_e820(); } - return; } + if (!fix) + return; + /* different nodes have different setting, disable them all at first*/ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; -- cgit v1.1 From d3fbe5ea9518b46a68e6b278974e92e2c3acef4a Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Mon, 2 Jun 2008 14:26:14 +0800 Subject: x86: split out common code into find_overlapped_early() This patch clean up reserve_early() family functions by extracting the common part of reserve_early(), free_early() and bad_addr() into find_overlapped_early(). Signed-off-by: Huang Ying <ying.huang@intel.com> Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/e820.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5d33b9c..ac5e9eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -558,20 +558,34 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = { {} }; -void __init reserve_early(u64 start, u64 end, char *name) +static int __init find_overlapped_early(u64 start, u64 end) { int i; struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { r = &early_res[i]; if (end > r->start && start < r->end) - panic("Overlapping early reservations %llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name?name:"", r->start, - r->end - 1, r->name); + break; } + + return i; +} + +void __init reserve_early(u64 start, u64 end, char *name) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); if (i >= MAX_EARLY_RES) panic("Too many early reservations"); r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name?name:"", r->start, + r->end - 1, r->name); r->start = start; r->end = end; if (name) @@ -583,14 +597,11 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i, j; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - r = &early_res[i]; - if (start == r->start && end == r->end) - break; - } - if (i >= MAX_EARLY_RES || !early_res[i].end) + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= MAX_EARLY_RES || r->end != end || r->start != start) panic("free_early on not reserved area: %llx-%llx!", - start, end); + start, end - 1); for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) ; @@ -626,17 +637,16 @@ void __init early_res_to_bootmem(u64 start, u64 end) static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) { int i; - u64 addr = *addrp, last; + u64 addr = *addrp; int changed = 0; + struct early_res *r; again: - last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last >= r->start && addr < r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < MAX_EARLY_RES && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; } return changed; } -- cgit v1.1 From d0ec2c6f2c2f0478b34ae78b3e65f60a561ac807 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Mon, 2 Jun 2008 14:26:18 +0800 Subject: x86: reserve highmem pages via reserve_early This patch makes early reserved highmem pages become reserved pages. This can be used for highmem pages allocated by bootloader such as EFI memory map, linked list of setup_data, etc. Signed-off-by: Huang Ying <ying.huang@intel.com> Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/e820.c | 11 +++++++++++ arch/x86/mm/init_32.c | 3 ++- include/asm-x86/e820.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ac5e9eb..a706e90 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,6 +612,17 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } +int __init page_is_reserved_early(unsigned long pagenr) +{ + u64 start = (u64)pagenr << PAGE_SHIFT; + int i; + struct early_res *r; + + i = find_overlapped_early(start, start + PAGE_SIZE); + r = &early_res[i]; + return (i < MAX_EARLY_RES && r->end); +} + void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10..0e7bb5e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -289,7 +289,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && + !page_is_reserved_early(pfn)) { ClearPageReserved(page); init_page_count(page); __free_page(page); diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 44ed9c0..8aa3232 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,6 +86,7 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); +extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); -- cgit v1.1 From ecacf09f7d26b2317e8b1d59fa40f62081fad0bb Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Mon, 2 Jun 2008 14:26:21 +0800 Subject: x86: reserve EFI memory map with reserve_early This patch reserves the EFI memory map with reserve_early(). Because EFI memory map is allocated by bootloader, if it is not reserved by reserved_early(), it may be overwritten through address returned by find_e820_area(). Signed-off-by: Huang Ying <ying.huang@intel.com> Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/efi.c | 33 ++++++++++++++++++++------------- arch/x86/kernel/efi_64.c | 6 ------ arch/x86/kernel/setup_32.c | 5 ++++- arch/x86/kernel/setup_64.c | 7 +++---- include/asm-x86/efi.h | 2 +- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 4a1a26d..d5c7fcd 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -238,6 +238,23 @@ static void __init add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +void __init efi_reserve_early(void) +{ + unsigned long pmap; + + pmap = boot_params.efi_info.efi_memmap; +#ifdef CONFIG_X86_64 + pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32; +#endif + memmap.phys_map = (void *)pmap; + memmap.nr_map = boot_params.efi_info.efi_memmap_size / + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; + reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, + "EFI memmap"); +} + #if EFI_DEBUG static void __init print_efi_memmap(void) { @@ -267,21 +284,11 @@ void __init efi_init(void) int i = 0; void *tmp; -#ifdef CONFIG_X86_32 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; - memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; -#else - efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); - memmap.phys_map = (void *) - (boot_params.efi_info.efi_memmap | - ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); +#ifdef CONFIG_X86_64 + efi_phys.systab = (void *)efi_phys.systab + + ((__u64)boot_params.efi_info.efi_systab_hi<<32); #endif - memmap.nr_map = boot_params.efi_info.efi_memmap_size / - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; efi.systab = early_ioremap((unsigned long)efi_phys.systab, sizeof(efi_system_table_t)); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 21a7b75..652c528 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -97,12 +97,6 @@ void __init efi_call_phys_epilog(void) early_runtime_code_mapping_set_exec(0); } -void __init efi_reserve_bootmem(void) -{ - reserve_bootmem_generic((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); -} - void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped __initdata; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 0ec6480..2960cbe 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -67,6 +67,7 @@ #include <asm/bios_ebda.h> #include <asm/cacheflush.h> #include <asm/processor.h> +#include <asm/efi.h> /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* @@ -683,8 +684,10 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) + "EL32", 4)) { efi_enabled = 1; + efi_reserve_early(); + } #endif ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 2599b27..078c02f 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -330,8 +330,10 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL64", 4)) + "EL64", 4)) { efi_enabled = 1; + efi_reserve_early(); + } #endif ARCH_SETUP @@ -457,9 +459,6 @@ void __init setup_arch(char **cmdline_p) acpi_reserve_bootmem(); #endif - if (efi_enabled) - efi_reserve_bootmem(); - #ifdef CONFIG_X86_MPPARSE /* * Find and reserve possible boot-time SMP configuration: diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h index d53004b..7ed2bd7 100644 --- a/include/asm-x86/efi.h +++ b/include/asm-x86/efi.h @@ -90,7 +90,7 @@ extern void *efi_ioremap(unsigned long addr, unsigned long size); #endif /* CONFIG_X86_32 */ -extern void efi_reserve_bootmem(void); +extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); -- cgit v1.1 From 0c51a965ed3c44dd50497e74492a015680e49899 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Mon, 2 Jun 2008 14:26:23 +0800 Subject: x86: extract common part of head32.c and head64.c into head.c This patch extracts the common part of head32.c and head64.c into head.c. Signed-off-by: Huang Ying <ying.huang@intel.com> Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Makefile | 1 + arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/head.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/head32.c | 50 ----------------------------------------- arch/x86/kernel/head64.c | 50 ----------------------------------------- include/asm-x86/bios_ebda.h | 2 ++ 6 files changed, 59 insertions(+), 101 deletions(-) create mode 100644 arch/x86/kernel/head.c diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3cff3c8..30b046b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -160,6 +160,7 @@ KBUILD_AFLAGS += $(mflags-y) head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o +head-y += arch/x86/kernel/head.o head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369a4e..7e01ce3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c new file mode 100644 index 0000000..e0d0ce5 --- /dev/null +++ b/arch/x86/kernel/head.c @@ -0,0 +1,55 @@ +#include <linux/kernel.h> +#include <linux/init.h> + +#include <asm/setup.h> +#include <asm/bios_ebda.h> + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... */ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + reserve_early(lowmem, 0x100000, "BIOS reserved"); +} diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 0b6cb9f..fa1d25d 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -13,56 +13,6 @@ #include <asm/e820.h> #include <asm/bios_ebda.h> -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_early(lowmem, 0x100000, "BIOS reserved"); -} - void __init i386_start_kernel(void) { reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b..f1773c8 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -51,56 +51,6 @@ static void __init copy_bootdata(char *real_mode_data) } } -#define BIOS_LOWMEM_KILOBYTES 0x413 - -/* - * The BIOS places the EBDA/XBDA at the top of conventional - * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. - */ -static void __init reserve_ebda_region(void) -{ - unsigned int lowmem, ebda_addr; - - /* To determine the position of the EBDA and the */ - /* end of conventional memory, we need to look at */ - /* the BIOS data area. In a paravirtual environment */ - /* that area is absent. We'll just have to assume */ - /* that the paravirt case can handle memory setup */ - /* correctly, without our help. */ - if (paravirt_enabled()) - return; - - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - - /* Fixup: bios puts an EBDA in the top 64K segment */ - /* of conventional memory, but does not adjust lowmem. */ - if ((lowmem - ebda_addr) <= 0x10000) - lowmem = ebda_addr; - - /* Fixup: bios does not report an EBDA at all. */ - /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ - if ((ebda_addr == 0) && (lowmem >= 0x9f000)) - lowmem = 0x9f000; - - /* Paranoia: should never happen, but... */ - if ((lowmem == 0) || (lowmem >= 0x100000)) - lowmem = 0x9f000; - - /* reserve all memory between lowmem and the 1MB mark */ - reserve_early(lowmem, 0x100000, "BIOS reserved"); -} - static void __init reserve_setup_data(void) { struct setup_data *data; diff --git a/include/asm-x86/bios_ebda.h b/include/asm-x86/bios_ebda.h index b4a46b7..0033e50 100644 --- a/include/asm-x86/bios_ebda.h +++ b/include/asm-x86/bios_ebda.h @@ -14,4 +14,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } +void reserve_ebda_region(void); + #endif /* _MACH_BIOS_EBDA_H */ -- cgit v1.1 From c45a707dbe35cb9aa6490223e5b1129fa3583948 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Mon, 2 Jun 2008 14:26:25 +0800 Subject: x86: linked list of setup_data for i386 This patch adds linked list of struct setup_data supported for i386. Signed-off-by: Huang Ying <ying.huang@intel.com> Cc: andi@firstfloor.org Cc: mingo@redhat.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/head.c | 18 ++++++++++++++++++ arch/x86/kernel/head64.c | 18 ------------------ arch/x86/kernel/setup.c | 22 ++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 3 +++ arch/x86/kernel/setup_64.c | 22 ---------------------- include/asm-x86/bootparam.h | 3 +++ 6 files changed, 46 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index e0d0ce5..a727c0b 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -53,3 +53,21 @@ void __init reserve_ebda_region(void) /* reserve all memory between lowmem and the 1MB mark */ reserve_early(lowmem, 0x100000, "BIOS reserved"); } + +void __init reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f1773c8..5fbed45 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -51,24 +51,6 @@ static void __init copy_bootdata(char *real_mode_data) } } -static void __init reserve_setup_data(void) -{ - struct setup_data *data; - unsigned long pa_data; - char buf[32]; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); - pa_data = data->next; - early_iounmap(data, sizeof(*data)); - } -} - void __init x86_64_start_kernel(char * real_mode_data) { int i; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b85..0a281f2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -137,3 +137,25 @@ void __init setup_per_cpu_areas(void) } #endif + +void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2960cbe..ee1ccdb 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -681,6 +681,7 @@ void __init setup_arch(char **cmdline_p) pre_setup_arch_hook(); early_cpu_init(); early_ioremap_init(); + reserve_setup_data(); #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, @@ -729,6 +730,8 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; + parse_setup_data(); + parse_early_param(); finish_e820_parsing(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 078c02f..adf3b04 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -272,28 +272,6 @@ void __attribute__((weak)) __init memory_setup(void) machine_specific_memory_setup(); } -static void __init parse_setup_data(void) -{ - struct setup_data *data; - unsigned long pa_data; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); - switch (data->type) { - default: - break; - } -#ifndef CONFIG_DEBUG_BOOT_PARAMS - free_early(pa_data, pa_data+sizeof(*data)+data->len); -#endif - pa_data = data->next; - early_iounmap(data, PAGE_SIZE); - } -} - #ifdef CONFIG_PCI_MMCONFIG extern void __cpuinit fam10h_check_enable_mmcfg(void); extern void __init check_enable_amd_mmconf_dmi(void); diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index f62f473..0a07390 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -106,4 +106,7 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); +void reserve_setup_data(void); +void parse_setup_data(void); + #endif /* _ASM_BOOTPARAM_H */ -- cgit v1.1 From 74e411cb6443d8bcb55fbe89fcc7a9ee574df91b Mon Sep 17 00:00:00 2001 From: Krzysztof Oledzki <ole@ans.pl> Date: Wed, 4 Jun 2008 03:40:17 +0200 Subject: x86: add another PCI ID for ICH6 force hpet. Tested on Asus P5GDC-V $ lspci -n -n |grep ISA 00:1f.0 ISA bridge [0601]: Intel Corporation 82801FB/FR (ICH6/ICH6R) LPC Interface Bridge [8086:2640] (rev 03) Force enabled HPET at base address 0xfed00000 hpet clockevent registered hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 hpet0: 3 64-bit timers, 14318180 Hz Signed-off-by: Krzysztof Piotr Oledzki <ole@ans.pl> Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index ddbf34d..79bdcd1 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -159,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, -- cgit v1.1 From 3ed3f06295e69700fa808396f7b350bff2b69de0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Wed, 4 Jun 2008 01:00:47 +0400 Subject: x86: nmi - consolidate nmi_watchdog_default for 32bit mode 64bit mode bootstrap code does set nmi_watchdog to NMI_NONE by default and doing the same on 32bit mode is safe too. Such an action saves us from several #ifdef. Btw, my previous commit commit 19ec673ced067316b9732bc6d1c4ff4052e5f795 Author: Cyrill Gorcunov <gorcunov@gmail.com> Date: Wed May 28 23:00:47 2008 +0400 x86: nmi - fix incorrect NMI watchdog used by default did not fix the problem completely, moreover it introduced additional bug - nmi_watchdog would be set to either NMI_LOCAL_APIC or NMI_IO_APIC _regardless_ to boot option if being enabled thru /proc/sys/kernel/nmi_watchdog. Sorry for that. Fix it too. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: macro@linux-mips.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi.c | 14 ++++++++------ include/asm-x86/nmi.h | 4 +--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index cbd4fa3..27ca8f6 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -487,14 +487,16 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return -EIO; } -#ifdef CONFIG_X86_64 /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); -#else - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; + +#ifdef CONFIG_X86_32 + if (nmi_watchdog == NMI_NONE) { + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; + } #endif if (nmi_watchdog == NMI_LOCAL_APIC) { diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 972a4f6..470bb4a 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -38,12 +38,10 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev) #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); -extern void nmi_watchdog_default(void); -#else -#define nmi_watchdog_default() do { } while (0) #endif extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); +extern void nmi_watchdog_default(void); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); -- cgit v1.1 From 1a1b1d1322ebd1ece405f3057cdd408bc77e391d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Wed, 4 Jun 2008 01:00:58 +0400 Subject: x86: watchdog - check for CPU is being supported This patch does check if CPU is being recongnized before call the unreserve(). Since enable_lapic_nmi_watchdog() does have such a check the same is make sense here too in a sake of code consistency (but nothing more). Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: macro@linux-mips.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93a..ddda4b6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -181,7 +181,9 @@ void disable_lapic_nmi_watchdog(void) return; on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - wd_ops->unreserve(); + + if (wd_ops) + wd_ops->unreserve(); BUG_ON(atomic_read(&nmi_active) != 0); } -- cgit v1.1 From 75b9f5d2a0318da9d5e694a9a1be33f46b4c021e Mon Sep 17 00:00:00 2001 From: "mingo@elte.hu" <mingo@elte.hu> Date: Thu, 5 Jun 2008 11:18:12 +0200 Subject: x86, nmi: fix build fix: arch/x86/kernel/built-in.o: In function `proc_nmi_enabled': : undefined reference to `nmi_watchdog_default' arch/x86/kernel/built-in.o: In function `native_smp_prepare_cpus': : undefined reference to `nmi_watchdog_default' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 27ca8f6..19f1b95 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -88,7 +88,6 @@ static inline unsigned int get_timer_irqs(int cpu) #endif } -#ifdef CONFIG_X86_64 /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -96,7 +95,6 @@ void nmi_watchdog_default(void) return; nmi_watchdog = NMI_NONE; } -#endif #ifdef CONFIG_SMP /* -- cgit v1.1 From ea57a5a6db8961de35cd1a4a80d8e01ee4307973 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 6 Jun 2008 16:28:23 +0200 Subject: x86, 32-bit: SRAT fix we were adding reserved BIOS ranges as general memory as well => not good. solves this crash: [ 20.068075] hostname used greatest stack depth: 6464 bytes left [ 20.121404] BUG: unable to handle kernel <1>BUG: unable to handle kernel NULL pointer dereference at 00000b8c [ 20.121404] IP: [<c01160ae>] kmap_atomic_prot+0x2d/0x1c3 [ 20.121404] *pdpt = 00000000367eb001 *pde = 0000000000000000 [ 20.121404] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 20.121404] [ 20.121404] Pid: 2061, comm: rc.sysinit Not tainted (2.6.26-rc3 #2440) [ 20.121404] EIP: 0060:[<c01160ae>] EFLAGS: 00010206 CPU: 0 [ 20.121404] EIP is at kmap_atomic_prot+0x2d/0x1c3 Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/srat_32.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 32d8b11..5eedd3f 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -240,6 +240,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) for (i = 0; i < MAX_APICID; i++) apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + remove_all_active_ranges(); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", @@ -247,14 +248,8 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) node_read_chunk(chunk->nid, chunk); e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); - } - - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = node_end_pfn[nid]; - - memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); + memory_present(chunk->nid, chunk->start_pfn, + min(chunk->end_pfn, max_pfn)); } return 1; out_fail: -- cgit v1.1 From 237749428552e2f81ec6c801482dfd8a777e470b Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 9 Jun 2008 10:57:16 +0200 Subject: Revert "x86, 32-bit: SRAT fix" This reverts commit ea57a5a6db8961de35cd1a4a80d8e01ee4307973, a better fix will be merged. --- arch/x86/kernel/srat_32.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 5eedd3f..32d8b11 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -240,7 +240,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) for (i = 0; i < MAX_APICID; i++) apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); - remove_all_active_ranges(); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", @@ -248,8 +247,14 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) node_read_chunk(chunk->nid, chunk); e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); - memory_present(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); + } + + for_each_online_node(nid) { + unsigned long start = node_start_pfn[nid]; + unsigned long end = node_end_pfn[nid]; + + memory_present(nid, start, end); + node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: -- cgit v1.1 From db3660c1905293b91653e07f7857576df71ebf28 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 3 Jun 2008 01:43:24 -0700 Subject: x86: remove all active memory ranges before registering them again after trimming - 64bit this way we keep the early_node_map all right. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index adf3b04..26d60cc 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -368,6 +368,7 @@ void __init setup_arch(char **cmdline_p) /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(end_pfn)) { + remove_all_active_ranges(); e820_register_active_regions(0, 0, -1UL); end_pfn = e820_end_of_ram(); } -- cgit v1.1 From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 2 +- include/linux/mm.h | 3 +-- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3..489605b 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd..7cbd949f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c..2154086 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn) +void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) { - int i; + int i, j; + int removed = 0; /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) - if (early_node_map[i].end_pfn == old_end_pfn) { + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= new_end_pfn) { + /* clear it */ + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].end_pfn > new_end_pfn) { early_node_map[i].end_pfn = new_end_pfn; - break; + continue; } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } } /** -- cgit v1.1 From 9043f007963f4039befa3c31f47173f74a0b1c70 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 6 Jun 2008 18:53:33 -0700 Subject: x86, numa, 32-bit: use find_e820_area() to find KVA RAM on node don't assume we can use RAM near the end of every node. Esp systems that have few memory and they could have kva address and kva RAM all below max_low_pfn. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 59 ++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 489605b..accc7c6 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -228,17 +228,21 @@ static unsigned long calculate_numa_remap_pages(void) { int nid; unsigned long size, reserve_pages = 0; - unsigned long pfn; for_each_online_node(nid) { - unsigned old_end_pfn = node_end_pfn[nid]; + u64 node_end_target; + u64 node_end_final; /* * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. */ + printk("node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; + if (!node_end_pfn[nid]) + continue; if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -250,37 +254,40 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - /* - * Validate the region we are allocating only contains valid - * pages. - */ - for (pfn = node_end_pfn[nid] - size; - pfn < node_end_pfn[nid]; pfn++) - if (!page_is_ram(pfn)) - break; - - if (pfn != node_end_pfn[nid]) - size = 0; + node_end_target = round_down(node_end_pfn[nid] - size, + PTRS_PER_PTE); + node_end_target <<= PAGE_SHIFT; + do { + node_end_final = find_e820_area(node_end_target, + ((u64)node_end_pfn[nid])<<PAGE_SHIFT, + ((u64)size)<<PAGE_SHIFT, + LARGE_PAGE_BYTES); + node_end_target -= LARGE_PAGE_BYTES; + } while (node_end_final == -1ULL && + (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid])); + + if (node_end_final == -1ULL) + panic("Can not get kva ram\n"); printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %ld pages\n", - nid, node_end_pfn[nid], node_end_pfn[nid] - size); - - if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { - /* - * Align node_end_pfn[] and node_remap_start_pfn[] to - * pmd boundary. remap_numa_kva will barf otherwise. - */ - printk("Shrinking node %d further by %ld pages for proper alignment\n", - nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); - size += node_end_pfn[nid] & (PTRS_PER_PTE-1); - } + printk("Shrinking node %d from %ld pages to %lld pages\n", + nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. + * layout will be: KVA address , KVA RAM + */ + if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) + reserve_early(node_end_final, + node_end_final+(((u64)size)<<PAGE_SHIFT), + "KVA RAM"); - node_end_pfn[nid] -= size; + node_end_pfn[nid] = node_end_final>>PAGE_SHIFT; node_remap_start_pfn[nid] = node_end_pfn[nid]; shrink_active_range(nid, node_end_pfn[nid]); } -- cgit v1.1 From c3ff01672a23fabb40d4b80ff25a845582fd07c2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 6 Jun 2008 18:54:26 -0700 Subject: x86: fix boot failure with 64GB+ system with numa 32-bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/srat_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 32d8b11..e9d9172 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -251,7 +251,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; - unsigned long end = node_end_pfn[nid]; + unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); -- cgit v1.1 From e0da33646826b66ef933d47ea2fb7a693fd849bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 8 Jun 2008 18:29:22 -0700 Subject: x86: introduce max_physical_apicid for bigsmp switching a multi-socket test-system with 3 or 4 ioapics, when 4 dualcore cpus or 2 quadcore cpus installed, needs to switch to bigsmp or physflat. CPU apic id is [4,11] instead of [0,7], and we need to check max apic id instead of cpu numbers. also add check for 32 bit when acpi is not compiled in or acpi=off. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 5 ++++- arch/x86/kernel/apic_64.c | 3 +++ arch/x86/kernel/genapic_64.c | 2 +- arch/x86/kernel/mpparse.c | 5 +++++ arch/x86/kernel/setup.c | 1 + arch/x86/kernel/setup_32.c | 11 +++++------ arch/x86/mach-generic/bigsmp.c | 2 +- include/asm-x86/mpspec.h | 1 + 8 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index c304759..954d679 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1518,6 +1518,9 @@ void __cpuinit generic_processor_info(int apicid, int version) */ cpu = 0; + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + /* * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y * but we need to work other dependencies like SMP_SUSPEND etc @@ -1525,7 +1528,7 @@ void __cpuinit generic_processor_info(int apicid, int version) * if (CPU_HOTPLUG_ENABLED || num_processors > 8) * - Ashok Raj <ashok.raj@intel.com> */ - if (num_processors > 8) { + if (max_physical_apicid >= 8) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (!APIC_XAPIC(version)) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 54087f9..1872555 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1093,6 +1093,9 @@ void __cpuinit generic_processor_info(int apicid, int version) */ cpu = 0; } + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + /* are we being called early in kernel startup? */ if (x86_cpu_to_apicid_early_ptr) { u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index cbaaf69..1fa8be5 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -51,7 +51,7 @@ void __init setup_apic_routing(void) else #endif - if (num_possible_cpus() <= 8) + if (max_physical_apicid < 8) genapic = &apic_flat; else genapic = &apic_physflat; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b4a950d..7591325 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -473,6 +473,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) ++mpc_record; #endif } + +#ifdef CONFIG_X86_GENERICARCH + generic_bigsmp_probe(); +#endif + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a281f2..45a5e24 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -17,6 +17,7 @@ unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; +unsigned int max_physical_apicid; EXPORT_SYMBOL(boot_cpu_physical_apicid); DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index ee1ccdb..be1a990 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -838,18 +838,17 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_ACPI acpi_boot_init(); - +#endif +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + if (smp_found_config) + get_smp_config(); +#endif #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) if (def_to_bigsmp) printk(KERN_WARNING "More than 8 CPUs detected and " "CONFIG_X86_PC cannot handle it.\nUse " "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); #endif -#endif -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) - if (smp_found_config) - get_smp_config(); -#endif e820_setup_gap(); e820_mark_nosave_regions(max_low_pfn); diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 95fc463..2a24301 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -48,7 +48,7 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { static int probe_bigsmp(void) { if (def_to_bigsmp) - dmi_bigsmp = 1; + dmi_bigsmp = 1; else dmi_check_system(bigsmp_dmi_table); return dmi_bigsmp; diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index f38b68e..4451d72 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -33,6 +33,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern unsigned int max_physical_apicid; extern int smp_found_config; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -- cgit v1.1 From 4e78c91abe1a40b905611100a593be62784ba355 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 9 Jun 2008 11:59:30 +0200 Subject: Revert "x86, numaq: add pci_acpi_scan_root() stub" This reverts commit f3294690979634ee10398bb0beadfe1d4edb881d. That bug will be fixed in a better way via: x86: make generic arch support NUMAQ --- arch/x86/kernel/numaq_32.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 992f53c..e65281b 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -87,14 +87,3 @@ static int __init numaq_tsc_disable(void) return 0; } arch_initcall(numaq_tsc_disable); - -#ifdef CONFIG_ACPI -/* - * Dummy implementation: - */ -struct pci_bus * __devinit -pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) -{ - return NULL; -} -#endif -- cgit v1.1 From d49c4288407b2ffa8cab270cb5bc6882abe969f6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 8 Jun 2008 18:31:54 -0700 Subject: x86: make generic arch support NUMAQ ... so it could fall back to normal numa and we'd reduce the impact of the NUMAQ subarch. NUMAQ depends on GENERICARCH also decouple genericarch numa from acpi. also make it fall back to bigsmp if apicid > 8. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 74 ++++++++++++++--------------- arch/x86/Makefile | 18 ------- arch/x86/boot/compressed/misc.c | 4 -- arch/x86/kernel/acpi/boot.c | 4 +- arch/x86/kernel/io_apic_32.c | 9 ++-- arch/x86/kernel/mpparse.c | 73 ++++++++++++++++++++++++++-- arch/x86/kernel/numaq_32.c | 2 - arch/x86/kernel/summit_32.c | 2 + arch/x86/mach-es7000/Makefile | 1 - arch/x86/mach-es7000/es7000plat.c | 47 ------------------ arch/x86/mach-generic/Makefile | 10 ++-- arch/x86/mach-generic/bigsmp.c | 2 - arch/x86/mach-generic/numaq.c | 41 ++++++++++++++++ arch/x86/mach-generic/probe.c | 15 +++++- arch/x86/pci/Makefile_32 | 5 +- arch/x86/pci/numa.c | 29 ++--------- drivers/acpi/Kconfig | 1 - include/asm-x86/mach-generic/mach_mpparse.h | 7 ++- include/asm-x86/mach-numaq/mach_apic.h | 39 ++++----------- include/asm-x86/mach-numaq/mach_mpparse.h | 11 +---- include/asm-x86/mmzone_32.h | 18 ++----- include/asm-x86/mpspec.h | 6 +++ include/asm-x86/numaq.h | 5 +- include/asm-x86/srat.h | 12 +++-- 24 files changed, 219 insertions(+), 216 deletions(-) create mode 100644 arch/x86/mach-generic/numaq.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e97615..8b89810 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -268,36 +268,6 @@ config X86_VOYAGER If you do not specifically know you have a Voyager based machine, say N here, otherwise the kernel you build will not be bootable. -config X86_NUMAQ - bool "NUMAQ (IBM/Sequent)" - depends on SMP && X86_32 - select NUMA - help - This option is used for getting Linux to run on a (IBM/Sequent) NUMA - multiquad box. This changes the way that processors are bootstrapped, - and uses Clustered Logical APIC addressing mode instead of Flat Logical. - You will need a new lynxer.elf file to flash your firmware with - send - email to <Martin.Bligh@us.ibm.com>. - -config X86_SUMMIT - bool "Summit/EXA (IBM x440)" - depends on X86_32 && SMP - help - This option is needed for IBM systems that use the Summit/EXA chipset. - In particular, it is needed for the x440. - - If you don't have one of these computers, you should say N here. - If you want to build a NUMA kernel, you must select ACPI. - -config X86_BIGSMP - bool "Support for other sub-arch SMP systems with more than 8 CPUs" - depends on X86_32 && SMP - help - This option is needed for the systems that have more than 8 CPUs - and if the system is not of any sub-arch type above. - - If you don't have such a system, you should say N here. - config X86_VISWS bool "SGI 320/540 (Visual Workstation)" depends on X86_32 @@ -311,12 +281,33 @@ config X86_VISWS and vice versa. See <file:Documentation/sgi-visws.txt> for details. config X86_GENERICARCH - bool "Generic architecture (Summit, bigsmp, ES7000, default)" + bool "Generic architecture" depends on X86_32 help - This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. - It is intended for a generic binary kernel. - If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. + This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default + subarchitectures. It is intended for a generic binary kernel. + if you select them all, kernel will probe it one by one. and will + fallback to default. + +if X86_GENERICARCH + +config X86_NUMAQ + bool "NUMAQ (IBM/Sequent)" + depends on SMP && X86_32 + select NUMA + help + This option is used for getting Linux to run on a NUMAQ (IBM/Sequent) + NUMA multiquad box. This changes the way that processors are + bootstrapped, and uses Clustered Logical APIC addressing mode instead + of Flat Logical. You will need a new lynxer.elf file to flash your + firmware with - send email to <Martin.Bligh@us.ibm.com>. + +config X86_SUMMIT + bool "Summit/EXA (IBM x440)" + depends on X86_32 && SMP + help + This option is needed for IBM systems that use the Summit/EXA chipset. + In particular, it is needed for the x440. config X86_ES7000 bool "Support for Unisys ES7000 IA32 series" @@ -324,8 +315,15 @@ config X86_ES7000 help Support for Unisys ES7000 systems. Say 'Y' here if this kernel is supposed to run on an IA32-based Unisys ES7000 system. - Only choose this option if you have such a system, otherwise you - should say N here. + +config X86_BIGSMP + bool "Support for big SMP systems with more than 8 CPUs" + depends on X86_32 && SMP + help + This option is needed for the systems that have more than 8 CPUs + and if the system is not of any sub-arch type above. + +endif config X86_RDC321X bool "RDC R-321x SoC" @@ -913,9 +911,9 @@ config X86_PAE config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC - default y if (X86_NUMAQ || X86_SUMMIT) + default y if (X86_NUMAQ || X86_SUMMIT || X86_GENERICARCH) help Enable NUMA (Non Uniform Memory Access) support. The kernel will try to allocate memory used by a CPU on the diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 30b046b..d665013 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -117,29 +117,11 @@ mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/ -# NUMAQ subarch support -mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq -mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/ - -# BIGSMP subarch support -mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp -mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/ - -#Summit subarch support -mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit -mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/ - # generic subarchitecture mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ - -# ES7000 subarch support -mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000 -fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ -mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/ - # RDC R-321x subarch support mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 90456ce..ba0be6a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -221,10 +221,6 @@ static char *vidmem; static int vidport; static int lines, cols; -#ifdef CONFIG_X86_NUMAQ -void *xquad_portio; -#endif - #include "../../../../lib/inflate.c" static void *malloc(int size) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f226bdc..7dc2130 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -848,7 +848,7 @@ static int __init acpi_parse_madt_lapic_entries(void) #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +#ifdef CONFIG_X86_ES7000 extern int es7000_plat; #endif @@ -997,7 +997,7 @@ void __init mp_config_acpi_legacy_irqs(void) set_bit(MP_ISA_BUS, mp_bus_not_pci); Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); -#if defined(CONFIG_X86_ES7000) || defined(CONFIG_X86_GENERICARCH) +#ifdef CONFIG_X86_ES7000 /* * Older generations of ES7000 have no legacy identity mappings */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 2285b81..97e62a0 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1722,7 +1722,6 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -#ifndef CONFIG_X86_NUMAQ static void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; @@ -1732,6 +1731,11 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; +#ifdef CONFIG_X86_NUMAQ + if (found_numaq) + return; +#endif + /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -1828,9 +1832,6 @@ static void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } -#else -static void __init setup_ioapic_ids_from_mpc(void) { } -#endif int no_timer_check __initdata; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 7591325..1cc7a4b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -49,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len) } #ifdef CONFIG_X86_NUMAQ +int found_numaq; /* * Have to match translation table entries to main table entries by counter * hence the mpc_record variable .... can't see a less disgusting way of * doing this .... */ +struct mpc_config_translation { + unsigned char mpc_type; + unsigned char trans_len; + unsigned char trans_type; + unsigned char trans_quad; + unsigned char trans_global; + unsigned char trans_local; + unsigned short trans_reserved; +}; + static int mpc_record; static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata; + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ + return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); +} + + +static inline int mpc_apic_id(struct mpc_config_processor *m, + struct mpc_config_translation *translation_record) +{ + int quad = translation_record->trans_quad; + int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); + + printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver, quad, logical_apicid); + return logical_apicid; +} + +int mp_bus_id_to_node[MAX_MP_BUSSES]; + +int mp_bus_id_to_local[MAX_MP_BUSSES]; + +static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, + struct mpc_config_translation *translation) +{ + int quad = translation->trans_quad; + int local = translation->trans_local; + + mp_bus_id_to_node[m->mpc_busid] = quad; + mp_bus_id_to_local[m->mpc_busid] = local; + printk(KERN_INFO "Bus #%d is %s (node %d)\n", + m->mpc_busid, name, quad); +} + +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +static void mpc_oem_pci_bus(struct mpc_config_bus *m, + struct mpc_config_translation *translation) +{ + int quad = translation->trans_quad; + int local = translation->trans_local; + + quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; +} + #endif static void __cpuinit MP_processor_info(struct mpc_config_processor *m) @@ -321,11 +379,11 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, } } -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, +void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) { if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! May not be a NUMA-Q system!\n"); + printk("Warning! Not a NUMA-Q system!\n"); else found_numaq = 1; @@ -388,7 +446,16 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) return 0; #ifdef CONFIG_X86_32 - mps_oem_check(mpc, oem, str); + /* + * need to make sure summit and es7000's mps_oem_check is safe to be + * called early via genericarch 's mps_oem_check + */ + if (early) { +#ifdef CONFIG_X86_NUMAQ + numaq_mps_oem_check(mpc, oem, str); +#endif + } else + mps_oem_check(mpc, oem, str); #endif /* save the local APIC address, it might be non-default */ diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 27b9082..f0f1de1 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -36,8 +36,6 @@ #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) -int found_numaq; - /* * Function: smp_dump_qct() * diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index ae75109..d67ce5f 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata; static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; +#ifndef CONFIG_X86_NUMAQ static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; +#endif static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) { diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile index 69dd4da..3ef8b43 100644 --- a/arch/x86/mach-es7000/Makefile +++ b/arch/x86/mach-es7000/Makefile @@ -3,4 +3,3 @@ # obj-$(CONFIG_X86_ES7000) := es7000plat.o -obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c index a41c77a..4354ce8 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/mach-es7000/es7000plat.c @@ -177,53 +177,6 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr) } #endif -/* - * This file also gets compiled if CONFIG_X86_GENERICARCH is set. Generic - * arch already has got following function definitions (asm-generic/es7000.c) - * hence no need to define these for that case. - */ -#ifndef CONFIG_X86_GENERICARCH -void es7000_sw_apic(void); -void __init enable_apic_mode(void) -{ - es7000_sw_apic(); - return; -} - -__init int mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (mpc->mpc_oemptr) { - struct mp_config_oemtable *oem_table = - (struct mp_config_oemtable *)mpc->mpc_oemptr; - if (!strncmp(oem, "UNISYS", 6)) - return parse_unisys_oem((char *)oem_table); - } - return 0; -} -#ifdef CONFIG_ACPI -/* Hook from generic ACPI tables.c */ -int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - unsigned long oem_addr; - if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (es7000_check_dsdt()) - return parse_unisys_oem((char *)oem_addr); - else { - setup_unisys(); - return 1; - } - } - return 0; -} -#else -int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 0; -} -#endif -#endif /* COFIG_X86_GENERICARCH */ - static void es7000_spin(int n) { diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile index 19d6d40..0dbd780 100644 --- a/arch/x86/mach-generic/Makefile +++ b/arch/x86/mach-generic/Makefile @@ -2,7 +2,11 @@ # Makefile for the generic architecture # -EXTRA_CFLAGS := -Iarch/x86/kernel +EXTRA_CFLAGS := -Iarch/x86/kernel -obj-y := probe.o summit.o bigsmp.o es7000.o default.o -obj-y += ../../x86/mach-es7000/ +obj-y := probe.o default.o +obj-$(CONFIG_X86_NUMAQ) += numaq.o +obj-$(CONFIG_X86_SUMMIT) += summit.o +obj-$(CONFIG_X86_BIGSMP) += bigsmp.o +obj-$(CONFIG_X86_ES7000) += es7000.o +obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/ diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 2a24301..59d7717 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -23,10 +23,8 @@ static int dmi_bigsmp; /* can be set by dmi scanners */ static int hp_ht_bigsmp(const struct dmi_system_id *d) { -#ifdef CONFIG_X86_GENERICARCH printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); dmi_bigsmp = 1; -#endif return 0; } diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c new file mode 100644 index 0000000..8091e68 --- /dev/null +++ b/arch/x86/mach-generic/numaq.c @@ -0,0 +1,41 @@ +/* + * APIC driver for the IBM NUMAQ chipset. + */ +#define APIC_DEFINITION 1 +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/smp.h> +#include <asm/mpspec.h> +#include <asm/genapic.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <asm/mach-numaq/mach_apic.h> +#include <asm/mach-numaq/mach_apicdef.h> +#include <asm/mach-numaq/mach_ipi.h> +#include <asm/mach-numaq/mach_mpparse.h> +#include <asm/mach-numaq/mach_wakecpu.h> +#include <asm/numaq.h> + +static int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + numaq_mps_oem_check(mpc, oem, productid); + return found_numaq; +} + +static int probe_numaq(void) +{ + /* already know from get_memcfg_numaq() */ + return found_numaq; +} + +/* Hook from generic ACPI tables.c */ +static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 0; +} + +struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index c5ae751..ba18dec 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -16,6 +16,7 @@ #include <asm/apicdef.h> #include <asm/genapic.h> +extern struct genapic apic_numaq; extern struct genapic apic_summit; extern struct genapic apic_bigsmp; extern struct genapic apic_es7000; @@ -24,9 +25,18 @@ extern struct genapic apic_default; struct genapic *genapic = &apic_default; static struct genapic *apic_probe[] __initdata = { +#ifdef CONFIG_X86_NUMAQ + &apic_numaq, +#endif +#ifdef CONFIG_X86_SUMMIT &apic_summit, +#endif +#ifdef CONFIG_X86_BIGSMP &apic_bigsmp, +#endif +#ifdef CONFIG_X86_ES7000 &apic_es7000, +#endif &apic_default, /* must be last */ NULL, }; @@ -54,6 +64,7 @@ early_param("apic", parse_apic); void __init generic_bigsmp_probe(void) { +#if CONFIG_X86_BIGSMP /* * This routine is used to switch to bigsmp mode when * - There is no apic= option specified by the user @@ -67,6 +78,7 @@ void __init generic_bigsmp_probe(void) printk(KERN_INFO "Overriding APIC driver with %s\n", genapic->name); } +#endif } void __init generic_apic_probe(void) @@ -88,7 +100,8 @@ void __init generic_apic_probe(void) /* These functions can switch the APIC even after the initial ->probe() */ -int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) +int __init mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) { int i; for (i = 0; apic_probe[i]; ++i) { diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 index 89ec35d..962d96c 100644 --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -13,10 +13,11 @@ pci-y := fixup.o pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o -# Careful: VISWS and NUMAQ overrule the pci-y above. The colons are +# Careful: VISWS overrule the pci-y above. The colons are # therefor correct. This needs a proper fix by distangling the code. pci-$(CONFIG_X86_VISWS) := visws.o fixup.o -pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o + +pci-$(CONFIG_X86_NUMAQ) += numa.o # Necessary for NUMAQ as well pci-$(CONFIG_NUMA) += mp_bus_to_node.o diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c index d9afbae..99f1ecd 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numa.c @@ -6,45 +6,21 @@ #include <linux/init.h> #include <linux/nodemask.h> #include <mach_apic.h> +#include <asm/mpspec.h> #include "pci.h" #define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ -int mp_bus_id_to_node[MAX_MP_BUSSES]; #define BUS2QUAD(global) (mp_bus_id_to_node[global]) -int mp_bus_id_to_local[MAX_MP_BUSSES]; #define BUS2LOCAL(global) (mp_bus_id_to_local[global]) -void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - mp_bus_id_to_node[m->mpc_busid] = quad; - mp_bus_id_to_local[m->mpc_busid] = local; - printk(KERN_INFO "Bus #%d is %s (node %d)\n", - m->mpc_busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) -void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; -} /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -#ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); -#endif #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) @@ -179,6 +155,9 @@ static int __init pci_numa_init(void) { int quad; + if (!found_numaq) + return 0; + raw_pci_ops = &pci_direct_conf1_mq; if (pcibios_scanned++) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index c52fca8..860f15f 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -4,7 +4,6 @@ menuconfig ACPI bool "ACPI (Advanced Configuration and Power Interface) Support" - depends on !X86_NUMAQ depends on !X86_VISWS depends on !IA64_HP_SIM depends on IA64 || X86 diff --git a/include/asm-x86/mach-generic/mach_mpparse.h b/include/asm-x86/mach-generic/mach_mpparse.h index 0d0b5ba..586cadb 100644 --- a/include/asm-x86/mach-generic/mach_mpparse.h +++ b/include/asm-x86/mach-generic/mach_mpparse.h @@ -1,7 +1,10 @@ #ifndef _MACH_MPPARSE_H #define _MACH_MPPARSE_H 1 -int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid); -int acpi_madt_oem_check(char *oem_id, char *oem_table_id); + +extern int mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); + +extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id); #endif diff --git a/include/asm-x86/mach-numaq/mach_apic.h b/include/asm-x86/mach-numaq/mach_apic.h index 75a56e5..d802465 100644 --- a/include/asm-x86/mach-numaq/mach_apic.h +++ b/include/asm-x86/mach-numaq/mach_apic.h @@ -20,8 +20,14 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ -#define check_apicid_used(bitmap, apicid) physid_isset(apicid, bitmap) -#define check_apicid_present(bit) physid_isset(bit, phys_cpu_present_map) +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return physid_isset(apicid, bitmap); +} +static inline unsigned long check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} #define apicid_cluster(apicid) (apicid & 0xF0) static inline int apic_id_registered(void) @@ -77,11 +83,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - static inline int apicid_to_node(int logical_apicid) { return logical_apicid >> 4; @@ -95,30 +96,6 @@ static inline physid_mask_t apicid_to_cpu_present(int logical_apicid) return physid_mask_of_physid(cpu + 4*node); } -struct mpc_config_translation { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - -static inline int mpc_apic_id(struct mpc_config_processor *m, - struct mpc_config_translation *translation_record) -{ - int quad = translation_record->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); - - printk("Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver, quad, logical_apicid); - return logical_apicid; -} - extern void *xquad_portio; static inline void setup_portio_remap(void) diff --git a/include/asm-x86/mach-numaq/mach_mpparse.h b/include/asm-x86/mach-numaq/mach_mpparse.h index 459b124..626aef6 100644 --- a/include/asm-x86/mach-numaq/mach_mpparse.h +++ b/include/asm-x86/mach-numaq/mach_mpparse.h @@ -1,14 +1,7 @@ #ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H -extern void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation); -extern void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation); - -/* Hook from generic ACPI tables.c */ -static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ -} +extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid); #endif /* __ASM_MACH_MPPARSE_H */ diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h index ab00128..b2298a2 100644 --- a/include/asm-x86/mmzone_32.h +++ b/include/asm-x86/mmzone_32.h @@ -12,11 +12,9 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) -#ifdef CONFIG_X86_NUMAQ - #include <asm/numaq.h> -#elif defined(CONFIG_ACPI_SRAT)/* summit or generic arch */ - #include <asm/srat.h> -#endif +#include <asm/numaq.h> +/* summit or generic arch */ +#include <asm/srat.h> extern int get_memcfg_numa_flat(void); /* @@ -26,14 +24,11 @@ extern int get_memcfg_numa_flat(void); */ static inline void get_memcfg_numa(void) { -#ifdef CONFIG_X86_NUMAQ + if (get_memcfg_numaq()) return; -#elif defined(CONFIG_ACPI_SRAT) if (get_memcfg_from_srat()) return; -#endif - get_memcfg_numa_flat(); } @@ -42,7 +37,6 @@ extern int early_pfn_to_nid(unsigned long pfn); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat -#define get_zholes_size(n) (0) #endif /* CONFIG_NUMA */ @@ -83,9 +77,6 @@ static inline int pfn_to_nid(unsigned long pfn) __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#ifdef CONFIG_X86_NUMAQ /* we have contiguous memory on NUMA-Q */ -#define pfn_valid(pfn) ((pfn) < num_physpages) -#else static inline int pfn_valid(int pfn) { int nid = pfn_to_nid(pfn); @@ -94,7 +85,6 @@ static inline int pfn_valid(int pfn) return (pfn < node_end_pfn(nid)); return 0; } -#endif /* CONFIG_X86_NUMAQ */ #endif /* CONFIG_DISCONTIGMEM */ diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 4451d72..6e9c958 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -13,6 +13,12 @@ extern int apic_version[MAX_APICS]; extern u8 apicid_2_node[]; extern int pic_mode; +#ifdef CONFIG_X86_NUMAQ +extern int mp_bus_id_to_node[MAX_MP_BUSSES]; +extern int mp_bus_id_to_local[MAX_MP_BUSSES]; +extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +#endif + #define MAX_APICID 256 #else diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h index 739d164..ef068d24 100644 --- a/include/asm-x86/numaq.h +++ b/include/asm-x86/numaq.h @@ -157,9 +157,10 @@ struct sys_cfg_data { struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ }; -static inline unsigned long *get_zholes_size(int nid) +#else +static inline int get_memcfg_numaq(void) { - return NULL; + return 0; } #endif /* CONFIG_X86_NUMAQ */ #endif /* NUMAQ_H */ diff --git a/include/asm-x86/srat.h b/include/asm-x86/srat.h index f4bba13..456fe0b 100644 --- a/include/asm-x86/srat.h +++ b/include/asm-x86/srat.h @@ -27,11 +27,13 @@ #ifndef _ASM_SRAT_H_ #define _ASM_SRAT_H_ -#ifndef CONFIG_ACPI_SRAT -#error CONFIG_ACPI_SRAT not defined, and srat.h header has been included -#endif - +#ifdef CONFIG_ACPI_SRAT extern int get_memcfg_from_srat(void); -extern unsigned long *get_zholes_size(int); +#else +static inline int get_memcfg_from_srat(void) +{ + return 0; +} +#endif #endif /* _ASM_SRAT_H_ */ -- cgit v1.1 From 6780711e9817585f89b02b0556aac3564cbe1f45 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 8 Jun 2008 19:53:26 -0700 Subject: x86: update mptable, fix need to call early_reserve_e820() to preallocate mptable for 32bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index be1a990..4fa9f6c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -749,6 +749,8 @@ void __init setup_arch(char **cmdline_p) */ max_pfn = e820_end_of_ram(); + /* preallocate 4k for mptable mpc */ + early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) { -- cgit v1.1 From af1cf204ba2fd8135933a2e4df523fb1112dc0e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sun, 25 May 2008 21:16:06 +0200 Subject: x86, mpparse: build fix fix: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `early_reserve_e820_mpc_new' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mpspec.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 6e9c958..b8ba374 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -46,7 +46,11 @@ extern unsigned long mp_lapic_addr; extern void find_smp_config(void); extern void get_smp_config(void); +#ifdef CONFIG_X86_MPPARSE extern void early_reserve_e820_mpc_new(void); +#else +static inline void early_reserve_e820_mpc_new(void) { } +#endif void __cpuinit generic_processor_info(int apicid, int version); #ifdef CONFIG_ACPI -- cgit v1.1 From 668231141f307ffd81db075b34bddaedae0ec863 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Thu, 5 Jun 2008 16:35:10 +0200 Subject: x86: fix compile warning in io_apic_{32,64}.c Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 1 + arch/x86/kernel/io_apic_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54f..4fbf656 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1489,6 +1489,7 @@ void /*__init*/ print_local_APIC(void * dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); v = apic_read(APIC_LVR); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8df..c7c6b00 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1077,6 +1077,7 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); -- cgit v1.1 From ce8e37cdbdb34a9faeade22e0e6440f0d04560f5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Fri, 6 Jun 2008 10:21:39 +0100 Subject: x86: set PAE PHYSICAL_MASK_SHIFT to 44 bits. When a 64-bit x86 processor runs in 32-bit PAE mode, a pte can potentially have the same number of physical address bits as the 64-bit host ("Enhanced Legacy PAE Paging"). This means, in theory, we could have up to 52 bits of physical address in a pte. The 32-bit kernel uses a 32-bit unsigned long to represent a pfn. This means that it can only represent physical addresses up to 32+12=44 bits wide. Rather than widening pfns everywhere, just set 2^44 as the Linux x86_32-PAE architectural limit for physical address size. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/page_32.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 424e82f..ccf0ba3 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -14,7 +14,8 @@ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) #ifdef CONFIG_X86_PAE -#define __PHYSICAL_MASK_SHIFT 36 +/* 44=32+12, the limit we can fit into an unsigned long pfn */ +#define __PHYSICAL_MASK_SHIFT 44 #define __VIRTUAL_MASK_SHIFT 32 #define PAGETABLE_LEVELS 3 -- cgit v1.1 From 9e26d84273541a8c6c2efb705457ca8e6245fb73 Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Fri, 6 Jun 2008 12:01:13 +0200 Subject: fix build bug in "x86: add PCI extended config space access for AMD Barcelona" Also much less code now. Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/amd_64.c | 1 + arch/x86/kernel/cpu/cpu.h | 5 +++++ arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/setup.h | 26 -------------------------- arch/x86/kernel/setup_64.c | 2 -- include/asm-x86/mmconfig.h | 6 ------ 7 files changed, 7 insertions(+), 37 deletions(-) delete mode 100644 arch/x86/kernel/setup.h diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 656b40a..a38d54f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -4,10 +4,8 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/apic.h> -#include <asm/mmconfig.h> #include <mach_apic.h> -#include "../setup.h" #include "cpu.h" /* diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 180097e..626fc21 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -6,6 +6,7 @@ #include <asm/cacheflush.h> #include <mach_apic.h> +#include "cpu.h" extern int __cpuinit get_model_name(struct cpuinfo_x86 *c); extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 783691b..f5d5bb1 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,3 +1,4 @@ +#ifdef CONFIG_X86_32 struct cpu_model_info { int vendor; @@ -36,3 +37,7 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); + +#endif /* CONFIG_X86_32 */ + +extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d8f1712..20e14db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -136,6 +136,7 @@ void __init setup_per_cpu_areas(void) setup_cpumask_of_cpu(); } +#endif #define ENABLE_CF8_EXT_CFG (1ULL << 46) void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) @@ -149,4 +150,3 @@ void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG); } -#endif diff --git a/arch/x86/kernel/setup.h b/arch/x86/kernel/setup.h deleted file mode 100644 index 66cc2c7..0000000 --- a/arch/x86/kernel/setup.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Internal declarations for shared x86 setup code. - * - * Copyright (c) 2008 Advanced Micro Devices, Inc. - * Contributed by Robert Richter <robert.richter@amd.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - * 02111-1307 USA - */ - -#ifndef _ARCH_X86_KERNEL_SETUP_H - -extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); - -#endif /* _ARCH_X86_KERNEL_SETUP_H */ diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 215bd67..25afdd8 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -73,8 +73,6 @@ #include <asm/pat.h> #include <asm/mmconfig.h> -#include "setup.h" - #include <mach_apic.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h index 691798f..95beda0 100644 --- a/include/asm-x86/mmconfig.h +++ b/include/asm-x86/mmconfig.h @@ -9,10 +9,4 @@ static inline void fam10h_check_enable_mmcfg(void) { } static inline void check_enable_amd_mmconf_dmi(void) { } #endif -#if defined(CONFIG_SMP) && defined(CONFIG_X86_64) -extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); -#else -static inline void amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) { } -#endif - #endif -- cgit v1.1 From 7058b06188232381c13ff05a3caf458bb892b834 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 7 Jun 2008 14:14:35 +0200 Subject: x86: coding style fixes to arch/x86/pci/irq. Before: total: 60 errors, 85 warnings, 1237 lines checked After: total: 1 errors, 82 warnings, 1226 lines checked WARNING: line over 80 characters Compile tested. paolo@paolo-desktop:/tmp$ size irq.o.* text data bss dec hex filename 6128 440 76 6644 19f4 irq.o.after 6128 440 76 6644 19f4 irq.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/pci/irq.c | 261 +++++++++++++++++++++++++---------------------------- 1 file changed, 125 insertions(+), 136 deletions(-) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0908fca..7300567 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -11,8 +11,8 @@ #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/dmi.h> -#include <asm/io.h> -#include <asm/smp.h> +#include <linux/io.h> +#include <linux/smp.h> #include <asm/io_apic.h> #include <linux/irq.h> #include <linux/acpi.h> @@ -61,7 +61,7 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; * and perform checksum verification. */ -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) { struct irq_routing_table *rt; int i; @@ -74,7 +74,7 @@ static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) rt->size < sizeof(struct irq_routing_table)) return NULL; sum = 0; - for (i=0; i < rt->size; i++) + for (i = 0; i < rt->size; i++) sum += addr[i]; if (!sum) { DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); @@ -100,7 +100,7 @@ static struct irq_routing_table * __init pirq_find_routing_table(void) return rt; printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); } - for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { + for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { rt = pirq_check_routing_table(addr); if (rt) return rt; @@ -122,20 +122,20 @@ static void __init pirq_peer_trick(void) struct irq_info *e; memset(busmap, 0, sizeof(busmap)); - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { e = &rt->slots[i]; #ifdef DEBUG { int j; DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); - for(j=0; j<4; j++) + for (j = 0; j < 4; j++) DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); DBG("\n"); } #endif busmap[e->bus] = 1; } - for(i = 1; i < 256; i++) { + for (i = 1; i < 256; i++) { int node; if (!busmap[i] || pci_find_bus(0, i)) continue; @@ -285,7 +285,7 @@ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; WARN_ON_ONCE(pirq > 4); - return read_config_nybble(router,0x43, pirqmap[pirq-1]); + return read_config_nybble(router, 0x43, pirqmap[pirq-1]); } static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) @@ -314,7 +314,7 @@ static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, /* * Cyrix: nibble offset 0x5C - * 0x5C bits 7:4 is INTB bits 3:0 is INTA + * 0x5C bits 7:4 is INTB bits 3:0 is INTA * 0x5D bits 7:4 is INTD bits 3:0 is INTC */ static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) @@ -350,7 +350,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * Apparently there are systems implementing PCI routing table using * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. * We try our best to handle both link mappings. - * + * * Currently (2003-05-21) it appears most SiS chipsets follow the * definition of routing registers from the SiS-5595 southbridge. * According to the SiS 5595 datasheets the revision id's of the @@ -370,7 +370,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * * 0x62: USBIRQ: * bit 6 OHCI function disabled (0), enabled (1) - * + * * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved * * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved @@ -487,9 +487,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq u8 irq; irq = 0; if (pirq <= 4) - { irq = read_config_nybble(router, 0x56, pirq - 1); - } printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", dev->vendor, dev->device, pirq, irq); return irq; @@ -497,12 +495,10 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) - { write_config_nybble(router, 0x56, pirq - 1, irq); - } return 1; } @@ -549,50 +545,49 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route if (pci_dev_present(pirq_440gx)) return 0; - switch(device) - { - case PCI_DEVICE_ID_INTEL_82371FB_0: - case PCI_DEVICE_ID_INTEL_82371SB_0: - case PCI_DEVICE_ID_INTEL_82371AB_0: - case PCI_DEVICE_ID_INTEL_82371MX: - case PCI_DEVICE_ID_INTEL_82443MX_0: - case PCI_DEVICE_ID_INTEL_82801AA_0: - case PCI_DEVICE_ID_INTEL_82801AB_0: - case PCI_DEVICE_ID_INTEL_82801BA_0: - case PCI_DEVICE_ID_INTEL_82801BA_10: - case PCI_DEVICE_ID_INTEL_82801CA_0: - case PCI_DEVICE_ID_INTEL_82801CA_12: - case PCI_DEVICE_ID_INTEL_82801DB_0: - case PCI_DEVICE_ID_INTEL_82801E_0: - case PCI_DEVICE_ID_INTEL_82801EB_0: - case PCI_DEVICE_ID_INTEL_ESB_1: - case PCI_DEVICE_ID_INTEL_ICH6_0: - case PCI_DEVICE_ID_INTEL_ICH6_1: - case PCI_DEVICE_ID_INTEL_ICH7_0: - case PCI_DEVICE_ID_INTEL_ICH7_1: - case PCI_DEVICE_ID_INTEL_ICH7_30: - case PCI_DEVICE_ID_INTEL_ICH7_31: - case PCI_DEVICE_ID_INTEL_ESB2_0: - case PCI_DEVICE_ID_INTEL_ICH8_0: - case PCI_DEVICE_ID_INTEL_ICH8_1: - case PCI_DEVICE_ID_INTEL_ICH8_2: - case PCI_DEVICE_ID_INTEL_ICH8_3: - case PCI_DEVICE_ID_INTEL_ICH8_4: - case PCI_DEVICE_ID_INTEL_ICH9_0: - case PCI_DEVICE_ID_INTEL_ICH9_1: - case PCI_DEVICE_ID_INTEL_ICH9_2: - case PCI_DEVICE_ID_INTEL_ICH9_3: - case PCI_DEVICE_ID_INTEL_ICH9_4: - case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: - case PCI_DEVICE_ID_INTEL_ICH10_0: - case PCI_DEVICE_ID_INTEL_ICH10_1: - case PCI_DEVICE_ID_INTEL_ICH10_2: - case PCI_DEVICE_ID_INTEL_ICH10_3: - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_INTEL_82371FB_0: + case PCI_DEVICE_ID_INTEL_82371SB_0: + case PCI_DEVICE_ID_INTEL_82371AB_0: + case PCI_DEVICE_ID_INTEL_82371MX: + case PCI_DEVICE_ID_INTEL_82443MX_0: + case PCI_DEVICE_ID_INTEL_82801AA_0: + case PCI_DEVICE_ID_INTEL_82801AB_0: + case PCI_DEVICE_ID_INTEL_82801BA_0: + case PCI_DEVICE_ID_INTEL_82801BA_10: + case PCI_DEVICE_ID_INTEL_82801CA_0: + case PCI_DEVICE_ID_INTEL_82801CA_12: + case PCI_DEVICE_ID_INTEL_82801DB_0: + case PCI_DEVICE_ID_INTEL_82801E_0: + case PCI_DEVICE_ID_INTEL_82801EB_0: + case PCI_DEVICE_ID_INTEL_ESB_1: + case PCI_DEVICE_ID_INTEL_ICH6_0: + case PCI_DEVICE_ID_INTEL_ICH6_1: + case PCI_DEVICE_ID_INTEL_ICH7_0: + case PCI_DEVICE_ID_INTEL_ICH7_1: + case PCI_DEVICE_ID_INTEL_ICH7_30: + case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_ESB2_0: + case PCI_DEVICE_ID_INTEL_ICH8_0: + case PCI_DEVICE_ID_INTEL_ICH8_1: + case PCI_DEVICE_ID_INTEL_ICH8_2: + case PCI_DEVICE_ID_INTEL_ICH8_3: + case PCI_DEVICE_ID_INTEL_ICH8_4: + case PCI_DEVICE_ID_INTEL_ICH9_0: + case PCI_DEVICE_ID_INTEL_ICH9_1: + case PCI_DEVICE_ID_INTEL_ICH9_2: + case PCI_DEVICE_ID_INTEL_ICH9_3: + case PCI_DEVICE_ID_INTEL_ICH9_4: + case PCI_DEVICE_ID_INTEL_ICH9_5: + case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_ICH10_0: + case PCI_DEVICE_ID_INTEL_ICH10_1: + case PCI_DEVICE_ID_INTEL_ICH10_2: + case PCI_DEVICE_ID_INTEL_ICH10_3: + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; } return 0; } @@ -606,7 +601,7 @@ static __init int via_router_probe(struct irq_router *r, * workarounds for some buggy BIOSes */ if (device == PCI_DEVICE_ID_VIA_82C586_0) { - switch(router->device) { + switch (router->device) { case PCI_DEVICE_ID_VIA_82C686: /* * Asus k7m bios wrongly reports 82C686A @@ -624,7 +619,7 @@ static __init int via_router_probe(struct irq_router *r, } } - switch(device) { + switch (device) { case PCI_DEVICE_ID_VIA_82C586_0: r->name = "VIA"; r->get = pirq_via586_get; @@ -647,13 +642,12 @@ static __init int via_router_probe(struct irq_router *r, static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_VLSI_82C534: - r->name = "VLSI 82C534"; - r->get = pirq_vlsi_get; - r->set = pirq_vlsi_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_VLSI_82C534: + r->name = "VLSI 82C534"; + r->get = pirq_vlsi_get; + r->set = pirq_vlsi_set; + return 1; } return 0; } @@ -661,14 +655,13 @@ static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_SERVERWORKS_OSB4: - case PCI_DEVICE_ID_SERVERWORKS_CSB5: - r->name = "ServerWorks"; - r->get = pirq_serverworks_get; - r->set = pirq_serverworks_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_SERVERWORKS_OSB4: + case PCI_DEVICE_ID_SERVERWORKS_CSB5: + r->name = "ServerWorks"; + r->get = pirq_serverworks_get; + r->set = pirq_serverworks_set; + return 1; } return 0; } @@ -677,7 +670,7 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, { if (device != PCI_DEVICE_ID_SI_503) return 0; - + r->name = "SIS"; r->get = pirq_sis_get; r->set = pirq_sis_set; @@ -686,47 +679,43 @@ static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_CYRIX_5520: - r->name = "NatSemi"; - r->get = pirq_cyrix_get; - r->set = pirq_cyrix_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_CYRIX_5520: + r->name = "NatSemi"; + r->get = pirq_cyrix_get; + r->set = pirq_cyrix_set; + return 1; } return 0; } static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_OPTI_82C700: - r->name = "OPTI"; - r->get = pirq_opti_get; - r->set = pirq_opti_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_OPTI_82C700: + r->name = "OPTI"; + r->get = pirq_opti_get; + r->set = pirq_opti_set; + return 1; } return 0; } static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_ITE_IT8330G_0: - r->name = "ITE"; - r->get = pirq_ite_get; - r->set = pirq_ite_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_ITE_IT8330G_0: + r->name = "ITE"; + r->get = pirq_ite_get; + r->set = pirq_ite_set; + return 1; } return 0; } static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { + switch (device) { case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); @@ -740,25 +729,24 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_AMD_VIPER_740B: - r->name = "AMD756"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7413: - r->name = "AMD766"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7443: - r->name = "AMD768"; - break; - default: - return 0; + switch (device) { + case PCI_DEVICE_ID_AMD_VIPER_740B: + r->name = "AMD756"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7413: + r->name = "AMD766"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7443: + r->name = "AMD768"; + break; + default: + return 0; } r->get = pirq_amd756_get; r->set = pirq_amd756_set; return 1; } - + static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { switch (device) { @@ -800,7 +788,7 @@ static struct pci_dev *pirq_router_dev; * FIXME: should we have an option to say "generic for * chipset" ? */ - + static void __init pirq_find_router(struct irq_router *r) { struct irq_routing_table *rt = pirq_table; @@ -819,7 +807,7 @@ static void __init pirq_find_router(struct irq_router *r) r->name = "default"; r->get = NULL; r->set = NULL; - + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", rt->rtr_vendor, rt->rtr_device); @@ -830,7 +818,7 @@ static void __init pirq_find_router(struct irq_router *r) return; } - for( h = pirq_routers; h->vendor; h++) { + for (h = pirq_routers; h->vendor; h++) { /* First look for a router match */ if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) break; @@ -882,7 +870,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); info = pirq_get_info(dev); if (!info) { @@ -921,8 +909,10 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) */ newirq = dev->irq; if (newirq && !((1 << newirq) & mask)) { - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; - else printk("\n" KERN_WARNING + if (pci_probe & PCI_USE_PIRQ_MASK) + newirq = 0; + else + printk("\n" KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask " "- try pci=usepirqmask\n" KERN_DEBUG, newirq, pci_name(dev)); @@ -942,8 +932,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) irq = pirq & 0xf; DBG(" -> hardcoded IRQ %d\n", irq); msg = "Hardcoded"; - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { DBG(" -> got IRQ %d\n", irq); msg = "Found"; eisa_set_level_irq(irq); @@ -978,15 +968,15 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) continue; if (info->irq[pin].link == pirq) { /* We refuse to override the dev->irq information. Give a warning! */ - if ( dev2->irq && dev2->irq != irq && \ + if (dev2->irq && dev2->irq != irq && \ (!(pci_probe & PCI_USE_PIRQ_MASK) || \ - ((1 << dev2->irq) & mask)) ) { + ((1 << dev2->irq) & mask))) { #ifndef CONFIG_PCI_MSI - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", pci_name(dev2), dev2->irq, irq); #endif - continue; - } + continue; + } dev2->irq = irq; pirq_penalty[irq]++; if (dev != dev2) @@ -1024,8 +1014,7 @@ static void __init pcibios_fixup_irqs(void) /* * Recalculate IRQ numbers if we use the I/O APIC. */ - if (io_apic_assign_pci_irqs) - { + if (io_apic_assign_pci_irqs) { int irq; if (pin) { @@ -1038,10 +1027,10 @@ static void __init pcibios_fixup_irqs(void) * busses itself so we should get into this branch reliably. */ if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; + struct pci_dev *bridge = dev->bus->self; pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", @@ -1131,7 +1120,7 @@ static int __init pcibios_irq_init(void) pirq_find_router(&pirq_router); if (pirq_table->exclusive_irqs) { int i; - for (i=0; i<16; i++) + for (i = 0; i < 16; i++) if (!(pirq_table->exclusive_irqs & (1 << i))) pirq_penalty[i] += 100; } @@ -1196,10 +1185,10 @@ static int pirq_enable_irq(struct pci_dev *dev) */ temp_dev = dev; while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; + struct pci_dev *bridge = dev->bus->self; pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", -- cgit v1.1 From 5b80fe8bd732b5dd442118a43e02db22e8fd93e2 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 7 Jun 2008 14:34:42 +0200 Subject: x86: coding style fixes to arch/x86/kernel/sys_i386_32.c Before: total: 16 errors, 25 warnings, 246 lines checked After: total: 0 errors, 7 warnings, 246 lines checked Compile tested. paolo@paolo-desktop:/tmp$ size sys* text data bss dec hex filename 1209 0 0 1209 4b9 sys_i386_32.o.after 1209 0 0 1209 4b9 sys_i386_32.o.before paolo@paolo-desktop:/tmp$ md5sum sys* 6144f6d6ce7342c3e192681a1ccaa1c1 sys_i386_32.o.after 6144f6d6ce7342c3e192681a1ccaa1c1 sys_i386_32.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/sys_i386_32.c | 64 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d2ab52c..7066cb8 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -19,8 +19,8 @@ #include <linux/utsname.h> #include <linux/ipc.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/uaccess.h> +#include <linux/unistd.h> asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg) * * This is really horribly ugly. */ -asmlinkage int sys_ipc (uint call, int first, int second, +asmlinkage int sys_ipc(uint call, int first, int second, int third, void __user *ptr, long fifth) { int version, ret; @@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second, switch (call) { case SEMOP: - return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); + return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); case SEMTIMEDOP: return sys_semtimedop(first, (struct sembuf __user *)ptr, second, (const struct timespec __user *)fifth); case SEMGET: - return sys_semget (first, second, third); + return sys_semget(first, second, third); case SEMCTL: { union semun fourth; if (!ptr) return -EINVAL; if (get_user(fourth.__pad, (void __user * __user *) ptr)) return -EFAULT; - return sys_semctl (first, second, third, fourth); + return sys_semctl(first, second, third, fourth); } case MSGSND: - return sys_msgsnd (first, (struct msgbuf __user *) ptr, + return sys_msgsnd(first, (struct msgbuf __user *) ptr, second, third); case MSGRCV: switch (version) { @@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second, struct ipc_kludge tmp; if (!ptr) return -EINVAL; - + if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp))) + (struct ipc_kludge __user *) ptr, + sizeof(tmp))) return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, + return sys_msgrcv(first, tmp.msgp, second, tmp.msgtyp, third); } default: - return sys_msgrcv (first, + return sys_msgrcv(first, (struct msgbuf __user *) ptr, second, fifth, third); } case MSGGET: - return sys_msgget ((key_t) first, second); + return sys_msgget((key_t) first, second); case MSGCTL: - return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); + return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); case SHMAT: switch (version) { default: { ulong raddr; - ret = do_shmat (first, (char __user *) ptr, second, &raddr); + ret = do_shmat(first, (char __user *) ptr, second, &raddr); if (ret) return ret; - return put_user (raddr, (ulong __user *) third); + return put_user(raddr, (ulong __user *) third); } case 1: /* iBCS2 emulator entry point */ if (!segment_eq(get_fs(), get_ds())) return -EINVAL; /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat (first, (char __user *) ptr, second, (ulong *) third); + return do_shmat(first, (char __user *) ptr, second, (ulong *) third); } - case SHMDT: - return sys_shmdt ((char __user *)ptr); + case SHMDT: + return sys_shmdt((char __user *)ptr); case SHMGET: - return sys_shmget (first, second, third); + return sys_shmget(first, second, third); case SHMCTL: - return sys_shmctl (first, second, + return sys_shmctl(first, second, (struct shmid_ds __user *) ptr); default: return -ENOSYS; @@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second, /* * Old cruft */ -asmlinkage int sys_uname(struct old_utsname __user * name) +asmlinkage int sys_uname(struct old_utsname __user *name) { int err; if (!name) return -EFAULT; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - return err?-EFAULT:0; + return err? -EFAULT:0; } -asmlinkage int sys_olduname(struct oldold_utsname __user * name) +asmlinkage int sys_olduname(struct oldold_utsname __user *name) { int error; if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) return -EFAULT; - - down_read(&uts_sem); - + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, __OLD_UTS_LEN); error |= __put_user(0, name->sysname + __OLD_UTS_LEN); @@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name) error |= __copy_to_user(&name->machine, &utsname()->machine, __OLD_UTS_LEN); error |= __put_user(0, name->machine + __OLD_UTS_LEN); - + up_read(&uts_sem); - + error = error ? -EFAULT : 0; return error; @@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]) long __res; asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" : "=a" (__res) - : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); + : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } -- cgit v1.1 From 6ddd2a27948f0bd02a2ad001e8a6816898eba0dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 16:59:53 +0200 Subject: x86: simplify idle selection default_idle is selected in cpu_idle(), when no other idle routine is selected. Select it in select_idle_routine() when mwait is not selected. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 18 +++++++----------- arch/x86/kernel/process_32.c | 7 +------ arch/x86/kernel/process_64.c | 7 ++----- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba370dc..b3078f4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -139,27 +139,23 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - static int selected; - - if (selected) - return; #ifdef CONFIG_X86_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { printk(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); } #endif + if (pm_idle) + return; + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* - * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ - if (!pm_idle) { - printk(KERN_INFO "using mwait in idle threads.\n"); - pm_idle = mwait_idle; - } - } - selected = 1; + printk(KERN_INFO "using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } else + pm_idle = default_idle; } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476df..ee4ab46 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -168,24 +168,19 @@ void cpu_idle(void) while (1) { tick_nohz_stop_sched_tick(); while (!need_resched()) { - void (*idle)(void); check_pgt_cache(); rmb(); - idle = pm_idle; if (rcu_pending(cpu)) rcu_check_callbacks(cpu, 0); - if (!idle) - idle = default_idle; - if (cpu_is_offline(cpu)) play_dead(); local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; - idle(); + pm_idle(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f3..db3d89a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -150,12 +150,9 @@ void cpu_idle(void) while (1) { tick_nohz_stop_sched_tick(); while (!need_resched()) { - void (*idle)(void); rmb(); - idle = pm_idle; - if (!idle) - idle = default_idle; + if (cpu_is_offline(smp_processor_id())) play_dead(); /* @@ -165,7 +162,7 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); - idle(); + pm_idle(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ -- cgit v1.1 From aa83f3f2cfc74d66d01b1d2eb1485ea1103a0f4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 17:11:13 +0200 Subject: x86: cleanup C1E enabled detection Rename the "MSR_K8_ENABLE_C1E" MSR to INT_PENDING_MSG, which is the name in the data sheet as well. Move the C1E mask to the header file. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd.c | 5 ++--- arch/x86/kernel/cpu/amd_64.c | 5 ++--- include/asm-x86/msr-index.h | 4 +++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a38d54f..30b5055 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -25,7 +25,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); #ifdef CONFIG_X86_LOCAL_APIC -#define ENABLE_C1E_MASK 0x18000000 #define CPUID_PROCESSOR_SIGNATURE 1 #define CPUID_XFAM 0x0ff00000 #define CPUID_XFAM_K8 0x00000000 @@ -45,8 +44,8 @@ static __cpuinit int amd_apic_timer_broken(void) break; case CPUID_XFAM_10H: case CPUID_XFAM_11H: - rdmsr(MSR_K8_ENABLE_C1E, lo, hi); - if (lo & ENABLE_C1E_MASK) { + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { if (smp_processor_id() != boot_cpu_physical_apicid) printk(KERN_INFO "AMD C1E detected late. " " Force timer broadcast.\n"); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 626fc21..6eef3c7 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -110,7 +110,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -#define ENABLE_C1E_MASK 0x18000000 #define CPUID_PROCESSOR_SIGNATURE 1 #define CPUID_XFAM 0x0ff00000 #define CPUID_XFAM_K8 0x00000000 @@ -130,8 +129,8 @@ static __cpuinit int amd_apic_timer_broken(void) break; case CPUID_XFAM_10H: case CPUID_XFAM_11H: - rdmsr(MSR_K8_ENABLE_C1E, lo, hi); - if (lo & ENABLE_C1E_MASK) + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) return 1; break; default: diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h index 09413ad..44bce77 100644 --- a/include/asm-x86/msr-index.h +++ b/include/asm-x86/msr-index.h @@ -111,7 +111,9 @@ #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 #define MSR_K8_HWCR 0xc0010015 -#define MSR_K8_ENABLE_C1E 0xc0010055 +#define MSR_K8_INT_PENDING_MSG 0xc0010055 +/* C1E active bits in int pending message */ +#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 #define MSR_K8_TSEG_ADDR 0xc0010112 #define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */ #define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ -- cgit v1.1 From 732d7be17b98ebfd59e5864c3490f19856fa832c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 17:27:20 +0200 Subject: x86: use cpuinfo to check for interrupt pending message msr Simplify code: no need to do a cpuid(1) again. The cpuinfo structure has all necessary information already. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd.c | 41 +++++++++++++++-------------------------- arch/x86/kernel/cpu/amd_64.c | 38 +++++++++++++++----------------------- 2 files changed, 30 insertions(+), 49 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 30b5055..e76b49e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -25,35 +25,24 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); #ifdef CONFIG_X86_LOCAL_APIC -#define CPUID_PROCESSOR_SIGNATURE 1 -#define CPUID_XFAM 0x0ff00000 -#define CPUID_XFAM_K8 0x00000000 -#define CPUID_XFAM_10H 0x00100000 -#define CPUID_XFAM_11H 0x00200000 -#define CPUID_XMOD 0x000f0000 -#define CPUID_XMOD_REV_F 0x00040000 /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(void) +static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) { u32 lo, hi; - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - switch (eax & CPUID_XFAM) { - case CPUID_XFAM_K8: - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) - break; - case CPUID_XFAM_10H: - case CPUID_XFAM_11H: - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - if (smp_processor_id() != boot_cpu_physical_apicid) - printk(KERN_INFO "AMD C1E detected late. " - " Force timer broadcast.\n"); - return 1; - } - break; - default: - /* err on the side of caution */ + + if (c->x86 < 0x0F) + return 0; + + /* Family 0x0f models < rev F do not have this MSR */ + if (c->x86 == 0x0f && c->x86_model < 0x40) + return 0; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + if (smp_processor_id() != boot_cpu_physical_apicid) + printk(KERN_INFO "AMD C1E detected late. " + "Force timer broadcast.\n"); return 1; } return 0; @@ -297,7 +286,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_LOCAL_APIC - if (amd_apic_timer_broken()) + if (amd_apic_timer_broken(c)) local_apic_timer_disabled = 1; #endif diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 6eef3c7..f5fc161 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -110,31 +110,23 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -#define CPUID_PROCESSOR_SIGNATURE 1 -#define CPUID_XFAM 0x0ff00000 -#define CPUID_XFAM_K8 0x00000000 -#define CPUID_XFAM_10H 0x00100000 -#define CPUID_XFAM_11H 0x00200000 -#define CPUID_XMOD 0x000f0000 -#define CPUID_XMOD_REV_F 0x00040000 - /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(void) +static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) { - u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + u32 lo, hi; - switch (eax & CPUID_XFAM) { - case CPUID_XFAM_K8: - if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) - break; - case CPUID_XFAM_10H: - case CPUID_XFAM_11H: - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) - return 1; - break; - default: - /* err on the side of caution */ + if (c->x86 < 0x0F) + return 0; + + /* Family 0x0f models < rev F do not have this MSR */ + if (c->x86 == 0x0f && c->x86_model < 0x40) + return 0; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + if (smp_processor_id() != boot_cpu_physical_apicid) + printk(KERN_INFO "AMD C1E detected late. " + "Force timer broadcast.\n"); return 1; } return 0; @@ -220,7 +212,7 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) amd_enable_pci_ext_cfg(c); - if (amd_apic_timer_broken()) + if (amd_apic_timer_broken(c)) disable_apic_timer = 1; if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { -- cgit v1.1 From 09fd4b4ef5bc7e5222c13acec1bee8cd252fb63f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 18:04:27 +0200 Subject: x86: use cpuid to check MWAIT support for C1 cpuid(0x05) provides extended information about MWAIT in EDX when bit 0 of ECX is set. Bit 4-7 of EDX determine whether MWAIT is supported for C1. C1E enabled CPUs have these bits set to 0. Based on an earlier patch from Andi Kleen. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b3078f4..fe415ba 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -122,19 +122,31 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ + +#define MWAIT_INFO 0x05 +#define MWAIT_ECX_EXTENDED_INFO 0x01 +#define MWAIT_EDX_C1 0xf0 + static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) { + u32 eax, ebx, ecx, edx; + if (force_mwait) return 1; - if (c->x86_vendor == X86_VENDOR_AMD) { - switch(c->x86) { - case 0x10: - case 0x11: - return 0; - } - } - return 1; + if (c->cpuid_level < MWAIT_INFO) + return 0; + + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx); + /* Check, whether EDX has extended info about MWAIT */ + if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) + return 1; + + /* + * edx enumeratios MONITOR/MWAIT extensions. Check, whether + * C1 supports MWAIT + */ + return (edx & MWAIT_EDX_C1); } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -- cgit v1.1 From 00dba56465228825ea806e3a7fc0aa6bba7bdc6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 18:35:28 +0200 Subject: x86: move more common idle functions/variables to process.c more unification. Should cause no change in functionality. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 54 ---------------------------------- arch/x86/kernel/process_64.c | 28 ------------------ 3 files changed, 70 insertions(+), 82 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fe415ba..9fea146 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -45,6 +45,76 @@ void arch_task_cache_init(void) SLAB_PANIC, NULL); } +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); + +#ifdef CONFIG_X86_32 +/* + * This halt magic was a workaround for ancient floppy DMA + * wreckage. It should be safe to remove. + */ +static int hlt_counter; +void disable_hlt(void) +{ + hlt_counter++; +} +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} +EXPORT_SYMBOL(enable_hlt); + +static inline int hlt_use_halt(void) +{ + return (!hlt_counter && boot_cpu_data.hlt_works_ok); +} +#else +static inline int hlt_use_halt(void) +{ + return 1; +} +#endif + +/* + * We use this if we don't have any better + * idle routine.. + */ +void default_idle(void) +{ + if (hlt_use_halt()) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + } else { + local_irq_enable(); + /* loop is done by the caller */ + cpu_relax(); + } +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + static void do_nothing(void *unused) { } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ee4ab46..ae40204 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -58,11 +58,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -static int hlt_counter; - -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); @@ -77,55 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); - -void disable_hlt(void) -{ - hlt_counter++; -} - -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - hlt_counter--; -} - -EXPORT_SYMBOL(enable_hlt); - -/* - * We use this if we don't have any better - * idle routine.. - */ -void default_idle(void) -{ - if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - - if (!need_resched()) - safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - } else { - local_irq_enable(); - /* loop is done by the caller */ - cpu_relax(); - } -} -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(default_idle); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include <asm/nmi.h> /* We don't actually take CPU down, just spin without interrupts. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index db3d89a..9fb3a6f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void); unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -EXPORT_SYMBOL(pm_idle); - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -94,25 +85,6 @@ void exit_idle(void) __exit_idle(); } -/* - * We use this if we don't have any better - * idle routine.. - */ -void default_idle(void) -{ - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); - if (!need_resched()) - safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; -} - #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); -- cgit v1.1 From c26421d01986e1521043c8feb47256833df3bf31 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu, 29 May 2008 12:01:44 -0700 Subject: x86: fix Xorg crash with xf86MapVidMem error Clarify the usage of mtrr_lookup() in PAT code, and to make PAT code resilient to mtrr lookup problems. Specifically, pat_x_mtrr_type() is restructured to highlight, under what conditions we look for mtrr hint. pat_x_mtrr_type() uses a default type when there are any errors in mtrr lookup (still maintaining the pat consistency). And, reserve_memtype() highlights its usage ot mtrr_lookup for request type of '-1' and also defaults in a sane way on any mtrr lookup failure. pat.c looks at mtrr type of a range to get a hint on what mapping type to request when user/API: (1) hasn't specified any type (/dev/mem mapping) and we do not want to take performance hit by always mapping UC_MINUS. This will be the case for /dev/mem mappings used to map BIOS area or ACPI region which are WB'able. In this case, as long as MTRR is not WB, PAT will request UC_MINUS for such mappings. (2) user/API requests WB mapping while in reality MTRR may have UC or WC. In this case, PAT can map as WB (without checking MTRR) and still effective type will be UC or WC. But, a subsequent request to map same region as UC or WC may fail, as the region will get trackked as WB in PAT list. Looking at MTRR hint helps us to track based on effective type rather than what user requested. Again, here mtrr_lookup is only used as hint and we fallback to WB mapping (as requested by user) as default. In both cases, after using the mtrr hint, we still go through the memtype list to make sure there are no inconsistencies among multiple users. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Tested-by: Rufus & Azrael <rufus-azrael@numericable.fr> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e83b770..320d644 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -164,32 +164,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, unsigned long pat_type; u8 mtrr_type; - mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFF) { /* MTRR not enabled */ - *ret_prot = prot; - return 0; - } - if (mtrr_type == 0xFE) { /* MTRR match error */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - if (mtrr_type != MTRR_TYPE_UNCACHABLE && - mtrr_type != MTRR_TYPE_WRBACK && - mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - pat_type = prot & _PAGE_CACHE_MASK; prot &= (~_PAGE_CACHE_MASK); - /* Currently doing intersection by hand. Optimize it later. */ + /* + * We return the PAT request directly for types where PAT takes + * precedence with respect to MTRR and for UC_MINUS. + * Consistency checks with other PAT requests is done later + * while going through memtype list. + */ if (pat_type == _PAGE_CACHE_WC) { *ret_prot = prot | _PAGE_CACHE_WC; + return 0; } else if (pat_type == _PAGE_CACHE_UC_MINUS) { *ret_prot = prot | _PAGE_CACHE_UC_MINUS; - } else if (pat_type == _PAGE_CACHE_UC || - mtrr_type == MTRR_TYPE_UNCACHABLE) { + return 0; + } else if (pat_type == _PAGE_CACHE_UC) { + *ret_prot = prot | _PAGE_CACHE_UC; + return 0; + } + + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + mtrr_type = mtrr_type_lookup(start, end); + + if (mtrr_type == MTRR_TYPE_UNCACHABLE) { *ret_prot = prot | _PAGE_CACHE_UC; } else if (mtrr_type == MTRR_TYPE_WRCOMB) { *ret_prot = prot | _PAGE_CACHE_WC; @@ -246,14 +247,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (req_type == -1) { /* - * Special case where caller wants to inherit from mtrr or - * existing pat mapping, defaulting to UC_MINUS in case of - * no match. + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFE) { /* MTRR match error */ - err = -1; - } if (mtrr_type == MTRR_TYPE_WRBACK) { req_type = _PAGE_CACHE_WB; -- cgit v1.1 From ee863ba7ab3d3ed8a9585d378aae69d1e3e9f1b4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Tue, 10 Jun 2008 16:04:30 +0200 Subject: x86: unconditionally enable PAT for AMD CPUs If PAT support is advertised it should just work. No errata known. Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index d8b3e4a..0fbd062 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -54,14 +54,11 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { - case X86_VENDOR_AMD: - if (c->x86 >= 0xf && c->x86 <= 0x11) - return; - break; case X86_VENDOR_INTEL: if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) return; break; + case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: case X86_VENDOR_TRANSMETA: return; -- cgit v1.1 From 97cfab6ac4ddfda0d722393bbf46cc40bc332107 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Tue, 10 Jun 2008 16:05:18 +0200 Subject: x86: PAT: fix ambiguous paranoia check in pat_init() Starting with commit 8d4a4300854f3971502e81dacd930704cb88f606 (x86: cleanup PAT cpu validation) the PAT CPU feature flag is not cleared anymore. Now the error message "PAT enabled, but CPU feature cleared" in pat_init() is misleading. Furthermore the current code does not check for existence of the PAT CPU feature flag if a CPU is whitelisted in validate_pat_support. This patch clears pat_wc_enabled if boot CPU has no PAT feature flag and adapts the paranoia check. Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/addon_cpuid_features.c | 7 ++++--- arch/x86/mm/pat.c | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 0fbd062..2df461f 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -53,6 +53,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_PAT void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) { + if (!cpu_has_pat) + pat_disable("PAT not supported by CPU."); + switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) @@ -64,8 +67,6 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) return; } - pat_disable(cpu_has_pat ? - "PAT disabled. Not yet verified on this CPU type." : - "PAT not supported by CPU."); + pat_disable("PAT disabled. Not yet verified on this CPU type."); } #endif diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 320d644..65105b1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -76,14 +76,15 @@ void pat_init(void) return; /* Paranoia check. */ - if (!cpu_has_pat) { - printk(KERN_ERR "PAT enabled, but CPU feature cleared\n"); + if (!cpu_has_pat && boot_pat_state) { /* - * Panic if this happens on the secondary CPU, and we + * If this happens we are on a secondary CPU, but * switched to PAT on the boot CPU. We have no way to * undo PAT. - */ - BUG_ON(boot_pat_state); + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); } /* Set PWT to Write-Combining. All other bits stay the same */ -- cgit v1.1 From cd7a4e936d345ab4cb49d68192d90bd4e4c58458 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Tue, 10 Jun 2008 16:05:39 +0200 Subject: x86: PAT: fixed checkpatch errors (and whitespaces) x86: PAT: fixed checkpatch errors (and whitespaces) Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 -- arch/x86/mm/pat.c | 29 ++++++++++++++--------------- include/asm-x86/pat.h | 6 ++---- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 2df461f..84a8220 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -1,9 +1,7 @@ - /* * Routines to indentify additional cpu features that are scattered in * cpuid space. */ - #include <linux/cpu.h> #include <asm/pat.h> diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 65105b1..a8b69bb 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -66,7 +66,7 @@ enum { PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ }; -#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) void pat_init(void) { @@ -100,8 +100,8 @@ void pat_init(void) * 011 UC _PAGE_CACHE_UC * PAT bit unused */ - pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | - PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); /* Boot CPU check */ if (!boot_pat_state) @@ -117,11 +117,11 @@ void pat_init(void) static char *cattr_name(unsigned long flags) { switch (flags & _PAGE_CACHE_MASK) { - case _PAGE_CACHE_UC: return "uncached"; - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; - case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - default: return "broken"; + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; } } @@ -536,11 +536,11 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * we maintain the tradition of paranoia in this code. */ if (!pat_wc_enabled && - ! ( boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + !(boot_cpu_has(X86_FEATURE_MTRR) || + boot_cpu_has(X86_FEATURE_K6_MTRR) || + boot_cpu_has(X86_FEATURE_CYRIX_ARR) || + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { flags = _PAGE_CACHE_UC; } #endif @@ -562,7 +562,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, return 0; if (pfn <= max_pfn_mapped && - ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); printk(KERN_INFO "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", @@ -600,4 +600,3 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } - diff --git a/include/asm-x86/pat.h b/include/asm-x86/pat.h index 88f60cc..6fa9710 100644 --- a/include/asm-x86/pat.h +++ b/include/asm-x86/pat.h @@ -1,6 +1,5 @@ - #ifndef _ASM_PAT_H -#define _ASM_PAT_H 1 +#define _ASM_PAT_H #include <linux/types.h> @@ -8,7 +7,7 @@ extern int pat_wc_enabled; extern void validate_pat_support(struct cpuinfo_x86 *c); #else -static const int pat_wc_enabled = 0; +static const int pat_wc_enabled; static inline void validate_pat_support(struct cpuinfo_x86 *c) { } #endif @@ -21,4 +20,3 @@ extern int free_memtype(u64 start, u64 end); extern void pat_disable(char *reason); #endif - -- cgit v1.1 From 499f8f84b8324ba27d756e03f373fa16eeed9ccc Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Tue, 10 Jun 2008 16:06:21 +0200 Subject: x86: rename pat_wc_enabled to pat_enabled BTW, what does pat_wc_enabled stand for? Does it mean "write-combining"? Currently it is used to globally switch on or off PAT support. Thus I renamed it to pat_enabled. I think this increases readability (and hope that I didn't miss something). Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pat.c | 16 ++++++++-------- arch/x86/pci/i386.c | 4 ++-- include/asm-x86/pat.h | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb315..ddeafed 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -261,7 +261,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use * UC MINUS. @@ -285,7 +285,7 @@ EXPORT_SYMBOL(ioremap_nocache); */ void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) { - if (pat_wc_enabled) + if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, __builtin_return_address(0)); else diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b..6916fe4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -805,7 +805,7 @@ int _set_memory_wc(unsigned long addr, int numpages) int set_memory_wc(unsigned long addr, int numpages) { - if (!pat_wc_enabled) + if (!pat_enabled) return set_memory_uc(addr, numpages); if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a8b69bb..4beccea 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -26,11 +26,11 @@ #include <asm/io.h> #ifdef CONFIG_X86_PAT -int __read_mostly pat_wc_enabled = 1; +int __read_mostly pat_enabled = 1; void __cpuinit pat_disable(char *reason) { - pat_wc_enabled = 0; + pat_enabled = 0; printk(KERN_INFO "%s\n", reason); } @@ -72,7 +72,7 @@ void pat_init(void) { u64 pat; - if (!pat_wc_enabled) + if (!pat_enabled) return; /* Paranoia check. */ @@ -225,8 +225,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long actual_type; int err = 0; - /* Only track when pat_wc_enabled */ - if (!pat_wc_enabled) { + /* Only track when pat_enabled */ + if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (ret_type) { if (req_type == -1) { @@ -440,8 +440,8 @@ int free_memtype(u64 start, u64 end) struct memtype *ml; int err = -EINVAL; - /* Only track when pat_wc_enabled */ - if (!pat_wc_enabled) { + /* Only track when pat_enabled */ + if (!pat_enabled) { return 0; } @@ -535,7 +535,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. */ - if (!pat_wc_enabled && + if (!pat_enabled && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 10fb308..6ccd7a1 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -299,9 +299,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, return -EINVAL; prot = pgprot_val(vma->vm_page_prot); - if (pat_wc_enabled && write_combine) + if (pat_enabled && write_combine) prot |= _PAGE_CACHE_WC; - else if (pat_wc_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here diff --git a/include/asm-x86/pat.h b/include/asm-x86/pat.h index 6fa9710..7edc473 100644 --- a/include/asm-x86/pat.h +++ b/include/asm-x86/pat.h @@ -4,10 +4,10 @@ #include <linux/types.h> #ifdef CONFIG_X86_PAT -extern int pat_wc_enabled; +extern int pat_enabled; extern void validate_pat_support(struct cpuinfo_x86 *c); #else -static const int pat_wc_enabled; +static const int pat_enabled; static inline void validate_pat_support(struct cpuinfo_x86 *c) { } #endif -- cgit v1.1 From bf07dc864902b3e788de5ab50dc62d5677da90f2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rjw@sisk.pl> Date: Thu, 12 Jun 2008 10:27:08 +0200 Subject: x86: remove obsolete PM definitions from NMI header Remove obsolete and no longer used PM-related definitions from include/asm-x86/nmi.h. Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/nmi.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 8455bf3..621bd09 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -15,27 +15,6 @@ */ int do_nmi_callback(struct pt_regs *regs, int cpu); -#ifdef CONFIG_PM - -/** Replace the PM callback routine for NMI. */ -struct pm_dev *set_nmi_pm_callback(pm_callback callback); - -/** Unset the PM callback routine back to the default. */ -void unset_nmi_pm_callback(struct pm_dev *dev); - -#else - -static inline struct pm_dev *set_nmi_pm_callback(pm_callback callback) -{ - return 0; -} - -static inline void unset_nmi_pm_callback(struct pm_dev *dev) -{ -} - -#endif /* CONFIG_PM */ - #ifdef CONFIG_X86_64 extern void default_do_nmi(struct pt_regs *); extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); -- cgit v1.1 From 6703f6d10dcd3316e03641a5ecaa6c8a04374d98 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rjw@sisk.pl> Date: Tue, 10 Jun 2008 00:10:48 +0200 Subject: x86, gart: add resume handling If GART IOMMU is used on an AMD64 system, the northbridge registers related to it should be restored during resume so that memory is not corrupted. Make gart_resume() handle that as appropriate. Ref. http://lkml.org/lkml/2008/5/25/96 and the following thread. Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 2 ++ arch/x86/kernel/pci-gart_64.c | 57 +++++++++++++++++++++++++++++++++++++++---- include/asm-x86/gart.h | 1 + 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index eb20f16..3409abb 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -496,4 +496,6 @@ out: write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } + + set_up_gart_resume(aper_order, aper_alloc); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 3710097..f505c38 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -549,14 +549,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static void enable_gart_translations(void) +{ + int i; + + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + static int gart_resume(struct sys_device *dev) { + printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + + if (fix_up_north_bridges) { + int i; + + printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, + aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, + aperture_alloc >> 25); + } + } + + enable_gart_translations(); + return 0; } static int gart_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + return 0; } static struct sysdev_class gart_sysdev_class = { @@ -614,16 +663,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) memset(gatt, 0, gatt_size); agp_gatt_table = gatt; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; - enable_gart_translation(dev, __pa(gatt)); - } + enable_gart_translations(); error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); if (error) panic("Could not register gart_sysdev -- would corrupt data on next suspend"); + flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index c818b96..eeca2f5 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -14,6 +14,7 @@ extern void gart_iommu_shutdown(void); extern void __init gart_parse_options(char *); extern void early_gart_iommu_check(void); extern void gart_iommu_hole_init(void); +extern void set_up_gart_resume(u32, u32); extern int fallback_aper_order; extern int fallback_aper_force; extern int gart_iommu_aperture; -- cgit v1.1 From f781b03c4b1c713ac000877c8bbc31fc4164a29b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Thu, 12 Jun 2008 17:08:16 +0400 Subject: x86: touch_nmi_watchdog(): reset alert counters for supported nmi_watchdog modes only The checking 'if nmi_watchdog > 0' (ie NMI_NONE) is quite fast but it has a side effect - it's taken even if nmi_watchdog = NMI_DISABLED. Nowadays nmi_watchdog is set up to NMI_NONE by default so this condition is properly taken most the time but we better show this explicitly. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 19f1b95..326a8f4 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -354,7 +354,8 @@ static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) { - if (nmi_watchdog > 0) { + if (nmi_watchdog == NMI_LOCAL_APIC || + nmi_watchdog == NMI_IO_APIC) { unsigned cpu; /* -- cgit v1.1 From e6e07d8a2d2989c1f42287131308aa2fde253631 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 16:08:17 -0700 Subject: x86: make asm/asm.h work for asm code. This is useful for unifying some pieces of asm code. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- include/asm-x86/asm.h | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/include/asm-x86/asm.h b/include/asm-x86/asm.h index 90dec0c..7093982 100644 --- a/include/asm-x86/asm.h +++ b/include/asm-x86/asm.h @@ -1,33 +1,29 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H -#ifdef CONFIG_X86_32 -/* 32 bits */ - -# define _ASM_PTR " .long " -# define _ASM_ALIGN " .balign 4 " -# define _ASM_MOV_UL " movl " - -# define _ASM_INC " incl " -# define _ASM_DEC " decl " -# define _ASM_ADD " addl " -# define _ASM_SUB " subl " -# define _ASM_XADD " xaddl " +#ifdef __ASSEMBLY__ +# define __ASM_FORM(x) x +#else +# define __ASM_FORM(x) " " #x " " +#endif +#ifdef CONFIG_X86_32 +# define __ASM_SEL(a,b) __ASM_FORM(a) #else -/* 64 bits */ +# define __ASM_SEL(a,b) __ASM_FORM(b) +#endif -# define _ASM_PTR " .quad " -# define _ASM_ALIGN " .balign 8 " -# define _ASM_MOV_UL " movq " +#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) -# define _ASM_INC " incq " -# define _ASM_DEC " decq " -# define _ASM_ADD " addq " -# define _ASM_SUB " subq " -# define _ASM_XADD " xaddq " +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) +#define _ASM_MOV_UL __ASM_SIZE(mov) -#endif /* CONFIG_X86_32 */ +#define _ASM_INC __ASM_SIZE(inc) +#define _ASM_DEC __ASM_SIZE(dec) +#define _ASM_ADD __ASM_SIZE(add) +#define _ASM_SUB __ASM_SIZE(sub) +#define _ASM_XADD __ASM_SIZE(xadd) /* Exception table entry */ # define _ASM_EXTABLE(from,to) \ -- cgit v1.1 From e01b70ef3eb3080fecc35e15f68cd274c0a48163 Mon Sep 17 00:00:00 2001 From: Jiri Hladky <hladky.jiri@googlemail.com> Date: Mon, 2 Jun 2008 12:00:19 +0200 Subject: x86: fix bug in arch/i386/lib/delay.c file, delay_loop function when trying to understand how Bogomips are implemented I have found a bug in arch/i386/lib/delay.c file, delay_loop function. The function fails for loops > 2^31+1. It because SF is set when dec returns numbers > 2^31. The fix is to use jnz instruction instead of jns (and add one decl instruction to the end to have exactly the same number of loops as in original version). Martin Mares observed: > It is a long time since I have hacked that file, but you should definitely > make sure that the function is never called with a zero argument. In such > case, the original version made just a single pass, but your version > makes 2^32 of them. fixed that. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/lib/delay_32.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index d710f2d..ef69131 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -3,6 +3,7 @@ * * Copyright (C) 1993 Linus Torvalds * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com> * * The __delay function must _NOT_ be inlined as its execution time * depends wildly on alignment on many x86 processors. The additional @@ -28,16 +29,22 @@ /* simple loop based delay: */ static void delay_loop(unsigned long loops) { - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); + " test %0,%0 \n" + " jz 3f \n" + " jmp 1f \n" + + ".align 16 \n" + "1: jmp 2f \n" + + ".align 16 \n" + "2: decl %0 \n" + " jnz 2b \n" + "3: decl %0 \n" + + : /* we don't need output */ + :"a" (loops) + ); } /* TSC based delay: */ -- cgit v1.1 From b4b3bd96f26586e53ab5482f1869221dd1b5ac36 Mon Sep 17 00:00:00 2001 From: Daniel Rahn <Daniel.Rahn@novell.com> Date: Fri, 6 Jun 2008 09:42:36 +0200 Subject: x86: correctly report NR_BANKS in mce_64.c attached is a no-brainer that makes kernel correctly report NR_BANKS for MCE. We are right now limited to NR_BANKS==6, but the error message will use the available number of banks instead of the defined maximum. For a Nehalem based system it will print: "MCE: warning: using only 9 banks" while the correct message would be "MCE: warning: using only 6 banks" Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index f1f3f5e..8c8299c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -445,9 +445,9 @@ static void mce_init(void *dummy) rdmsrl(MSR_IA32_MCG_CAP, cap); banks = cap & 0xff; if (banks > MCE_EXTENDED_BANK) { + banks = MCE_EXTENDED_BANK; printk(KERN_INFO "MCE: warning: using only %d banks\n", MCE_EXTENDED_BANK); - banks = MCE_EXTENDED_BANK; } /* Use accurate RIP reporting if available. */ if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) -- cgit v1.1 From 6cf514fce18589ea1e0521c5f2d7c2bb280fefc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins <hugh@veritas.com> Date: Mon, 16 Jun 2008 18:42:43 +0100 Subject: x86: PAT: make pat_x_mtrr_type() more readable Clean up over-complications in pat_x_mtrr_type(). And if reserve_memtype() ignores stray req_type bits when pat_enabled, it's better to mask them off when not also. Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7c21572..ac3a2b1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -159,47 +159,31 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ * The intersection is based on "Effective Memory Type" tables in IA-32 * SDM vol 3a */ -static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, - unsigned long *ret_prot) +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) { - unsigned long pat_type; u8 mtrr_type; - pat_type = prot & _PAGE_CACHE_MASK; - prot &= (~_PAGE_CACHE_MASK); - /* * We return the PAT request directly for types where PAT takes * precedence with respect to MTRR and for UC_MINUS. * Consistency checks with other PAT requests is done later * while going through memtype list. */ - if (pat_type == _PAGE_CACHE_WC) { - *ret_prot = prot | _PAGE_CACHE_WC; - return 0; - } else if (pat_type == _PAGE_CACHE_UC_MINUS) { - *ret_prot = prot | _PAGE_CACHE_UC_MINUS; - return 0; - } else if (pat_type == _PAGE_CACHE_UC) { - *ret_prot = prot | _PAGE_CACHE_UC; - return 0; - } + if (req_type == _PAGE_CACHE_WC || + req_type == _PAGE_CACHE_UC_MINUS || + req_type == _PAGE_CACHE_UC) + return req_type; /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ mtrr_type = mtrr_type_lookup(start, end); - - if (mtrr_type == MTRR_TYPE_UNCACHABLE) { - *ret_prot = prot | _PAGE_CACHE_UC; - } else if (mtrr_type == MTRR_TYPE_WRCOMB) { - *ret_prot = prot | _PAGE_CACHE_WC; - } else { - *ret_prot = prot | _PAGE_CACHE_WB; - } - - return 0; + if (mtrr_type == MTRR_TYPE_UNCACHABLE) + return _PAGE_CACHE_UC; + if (mtrr_type == MTRR_TYPE_WRCOMB) + return _PAGE_CACHE_WC; + return _PAGE_CACHE_WB; } /* @@ -232,7 +216,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (req_type == -1) { *ret_type = _PAGE_CACHE_WB; } else { - *ret_type = req_type; + *ret_type = req_type & _PAGE_CACHE_MASK; } } return 0; @@ -264,14 +248,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } } else { req_type &= _PAGE_CACHE_MASK; - err = pat_x_mtrr_type(start, end, req_type, &actual_type); - } - - if (err) { - if (ret_type) - *ret_type = actual_type; - - return -EINVAL; + actual_type = pat_x_mtrr_type(start, end, req_type); } new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); -- cgit v1.1 From 0db125c467afcbcc229abb1a87bc36ef72777dc2 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Tue, 10 Jun 2008 23:45:45 +0200 Subject: x86: more header fixes Summary: Add missing include guards for some x86 headers. This has only had the most rudimentary testing, but is hopefully obviously correct. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/seccomp_64.h | 1 + include/asm-x86/suspend_32.h | 5 +++++ include/asm-x86/xor_32.h | 5 +++++ include/asm-x86/xor_64.h | 5 +++++ 4 files changed, 16 insertions(+) diff --git a/include/asm-x86/seccomp_64.h b/include/asm-x86/seccomp_64.h index 553af65..76cfe69 100644 --- a/include/asm-x86/seccomp_64.h +++ b/include/asm-x86/seccomp_64.h @@ -1,4 +1,5 @@ #ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H #include <linux/thread_info.h> diff --git a/include/asm-x86/suspend_32.h b/include/asm-x86/suspend_32.h index 24e1c08..8675c67 100644 --- a/include/asm-x86/suspend_32.h +++ b/include/asm-x86/suspend_32.h @@ -3,6 +3,9 @@ * Based on code * Copyright 2001 Patrick Mochel <mochel@osdl.org> */ +#ifndef __ASM_X86_32_SUSPEND_H +#define __ASM_X86_32_SUSPEND_H + #include <asm/desc.h> #include <asm/i387.h> @@ -44,3 +47,5 @@ static inline void acpi_save_register_state(unsigned long return_point) /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); #endif + +#endif /* __ASM_X86_32_SUSPEND_H */ diff --git a/include/asm-x86/xor_32.h b/include/asm-x86/xor_32.h index 067b5c1..921b458 100644 --- a/include/asm-x86/xor_32.h +++ b/include/asm-x86/xor_32.h @@ -1,3 +1,6 @@ +#ifndef ASM_X86__XOR_32_H +#define ASM_X86__XOR_32_H + /* * Optimized RAID-5 checksumming functions for MMX and SSE. * @@ -881,3 +884,5 @@ do { \ deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) \ (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) + +#endif /* ASM_X86__XOR_32_H */ diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h index 24957e3..2d3a18d 100644 --- a/include/asm-x86/xor_64.h +++ b/include/asm-x86/xor_64.h @@ -1,3 +1,6 @@ +#ifndef ASM_X86__XOR_64_H +#define ASM_X86__XOR_64_H + /* * Optimized RAID-5 checksumming functions for MMX and SSE. * @@ -354,3 +357,5 @@ do { \ We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* ASM_X86__XOR_64_H */ -- cgit v1.1 From fe94ae995d33a4df35b6b9cd0504e87d7e37c8de Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 14 Jun 2008 14:06:19 +0200 Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/p4.c Before: total: 16 errors, 34 warnings, 257 lines checked After: total: 0 errors, 2 warnings, 257 lines checked No changes in the compiled code: paolo@paolo-desktop:~/linux.trees.git$ size /tmp/p4* text data bss dec hex filename 2644 4 4 2652 a5c /tmp/p4.o.after 2644 4 4 2652 a5c /tmp/p4.o.before paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/p4* 13f1b21c4246b31a28aaff38184586ca /tmp/p4.o.after 13f1b21c4246b31a28aaff38184586ca /tmp/p4.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/p4.c | 90 ++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index cb03345..eef001a 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -8,7 +8,7 @@ #include <linux/interrupt.h> #include <linux/smp.h> -#include <asm/processor.h> +#include <asm/processor.h> #include <asm/system.h> #include <asm/msr.h> #include <asm/apic.h> @@ -32,12 +32,12 @@ struct intel_mce_extended_msrs { /* u32 *reserved[]; */ }; -static int mce_num_extended_msrs = 0; +static int mce_num_extended_msrs; #ifdef CONFIG_X86_MCE_P4THERMAL static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ +{ printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); @@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) * be some SMM goo which handles it, so we can't even put a handler * since it might be delivered via SMI already -zwanem. */ - rdmsr (MSR_IA32_MISC_ENABLE, l, h); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); if ((l & (1<<3)) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", @@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; /* -EBUSY */ } - /* check whether a vector already exists, temporarily masked? */ + /* check whether a vector already exists, temporarily masked? */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " "installed\n", @@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ apic_write_around(APIC_LVTTHMR, h); - rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); /* ok we're good to go... */ vendor_thermal_interrupt = intel_thermal_interrupt; - - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - l = apic_read (APIC_LVTTHMR); - apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); @@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); - rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr (MSR_IA32_MCG_EDX, r->edx, h); - rdmsr (MSR_IA32_MCG_ESI, r->esi, h); - rdmsr (MSR_IA32_MCG_EDI, r->edi, h); - rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr (MSR_IA32_MCG_ESP, r->esp, h); - rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr (MSR_IA32_MCG_EIP, r->eip, h); + rdmsr(MSR_IA32_MCG_EAX, r->eax, h); + rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); + rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); + rdmsr(MSR_IA32_MCG_EDX, r->edx, h); + rdmsr(MSR_IA32_MCG_ESI, r->esi, h); + rdmsr(MSR_IA32_MCG_EDI, r->edi, h); + rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); + rdmsr(MSR_IA32_MCG_ESP, r->esp, h); + rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); + rdmsr(MSR_IA32_MCG_EIP, r->eip, h); } -static void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs *regs, long error_code) { - int recover=1; + int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; + recover = 0; printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); @@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) } if (recover & 2) - panic ("CPU context corrupt"); + panic("CPU context corrupt"); if (recover & 1) - panic ("Unable to continue"); + panic("Unable to continue"); printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not * recoverable/continuable.This will allow BIOS to look at the MSRs * for errors if the OS could not log the error. */ - for (i=0; i<nr_mce_banks; i++) { + for (i = 0; i < nr_mce_banks; i++) { u32 msr; msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr (msr, low, high); + rdmsr(msr, low, high); if (high&(1<<31)) { /* Clear it */ wrmsr(msr, 0UL, 0UL); @@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) } } mcgstl &= ~(1<<2); - wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); + wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } @@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; - + machine_check_vector = intel_machine_check; wmb(); - printk (KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr (MSR_IA32_MCG_CAP, l, h); + printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ - wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; - for (i=0; i<nr_mce_banks; i++) { - wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + for (i = 0; i < nr_mce_banks; i++) { + wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); } - set_in_cr4 (X86_CR4_MCE); - printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id()); /* Check for P4/Xeon extended MCE MSRs */ - rdmsr (MSR_IA32_MCG_CAP, l, h); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<9)) {/* MCG_EXT_P */ mce_num_extended_msrs = (l >> 16) & 0xff; - printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" + printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" " available\n", smp_processor_id(), mce_num_extended_msrs); -- cgit v1.1 From 5175676a2d012ca5e5ad5eaedbfc1da5d5660d2a Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 14 Jun 2008 14:37:14 +0200 Subject: x86: coding style fixes to arch/x86/kernel/cpu/mcheck/k7.c Before: total: 6 errors, 13 warnings, 105 lines checked After: total: 0 errors, 0 warnings, 105 lines checked paolo@paolo-desktop:~/linux.trees.git$ size /tmp/k7* text data bss dec hex filename 1135 0 0 1135 46f /tmp/k7.o.after 1135 0 0 1135 46f /tmp/k7.o.before paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/k7* 87b14954045aa37dbaee6fb7e022ed9a /tmp/k7.o.after 87b14954045aa37dbaee6fb7e022ed9a /tmp/k7.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/k7.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index e633c9c..f390c9f 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -9,23 +9,23 @@ #include <linux/interrupt.h> #include <linux/smp.h> -#include <asm/processor.h> +#include <asm/processor.h> #include <asm/system.h> #include <asm/msr.h> #include "mce.h" /* Machine Check Handler For AMD Athlon/Duron */ -static void k7_machine_check(struct pt_regs * regs, long error_code) +static void k7_machine_check(struct pt_regs *regs, long error_code) { - int recover=1; + int recover = 1; u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; + recover = 0; printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); @@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code) } if (recover&2) - panic ("CPU context corrupt"); + panic("CPU context corrupt"); if (recover&1) - panic ("Unable to continue"); - printk (KERN_EMERG "Attempting to continue.\n"); + panic("Unable to continue"); + printk(KERN_EMERG "Attempting to continue.\n"); mcgstl &= ~(1<<2); - wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); + wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); } @@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = k7_machine_check; wmb(); - printk (KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr (MSR_IA32_MCG_CAP, l, h); + printk(KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ - wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff; /* Clear status for MC index 0 separately, we don't touch CTL, * as some K7 Athlons cause spurious MCEs when its enabled. */ if (boot_cpu_data.x86 == 6) { - wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); i = 1; } else i = 0; - for (; i<nr_mce_banks; i++) { - wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + for (; i < nr_mce_banks; i++) { + wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); } - set_in_cr4 (X86_CR4_MCE); - printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id()); } -- cgit v1.1 From f016e15c11ba8945ff8e1945445648722d6fadfb Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 14 Jun 2008 20:44:52 +0200 Subject: x86: coding style fixes to arch/x86/math-emu/reg_constant Before: total: 6 errors, 1 warnings, 117 lines checked After: total: 0 errors, 1 warnings, 117 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/reg_constant.o.* 780388a3056d58fb759efaf190d5d3d1 /tmp/reg_constant.o.after 780388a3056d58fb759efaf190d5d3d1 /tmp/reg_constant.o.before paolo@paolo-desktop:~/linux.trees.git$ size /tmp/reg_constant.o.* text data bss dec hex filename 457 0 0 457 1c9 /tmp/reg_constant.o.after 457 0 0 457 1c9 /tmp/reg_constant.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/math-emu/reg_constant.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index 04869e6..0054835 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -16,8 +16,8 @@ #include "reg_constant.h" #include "control_w.h" -#define MAKE_REG(s,e,l,h) { l, h, \ - ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } +#define MAKE_REG(s, e, l, h) { l, h, \ + ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 @@ -40,7 +40,7 @@ FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); /* Only the sign and significand (and tag) are used in internal NaNs */ -/* The 80486 never generates one of these +/* The 80486 never generates one of these FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); */ /* This is the real indefinite QNaN */ @@ -49,7 +49,7 @@ FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); /* Only the sign (and tag) is used in internal infinities */ FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); -static void fld_const(FPU_REG const *c, int adj, u_char tag) +static void fld_const(FPU_REG const * c, int adj, u_char tag) { FPU_REG *st_new_ptr; -- cgit v1.1 From 219835f10eaef19d2b976076f2bc4ad806856c55 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sat, 14 Jun 2008 21:11:39 +0200 Subject: x86: coding style fixes to x86/kernel/cpu/cpufreq/cpufreq-nforce2.c Before: total: 22 errors, 8 warnings, 440 lines checked After: total: 0 errors, 8 warnings, 442 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/cpufreq-nforce2.o.* 3d4330a5d188fe904446e5948a618b48 /tmp/cpufreq-nforce2.o.after 1477e6b0dcd6f59b1fb6b4490042eca6 /tmp/cpufreq-nforce2.o.before ^^^ I guess this is because I fixed a few "do not initialise statics to 0 or NULL" paolo@paolo-desktop:~/linux.trees.git$ size /tmp/cpufreq-nforce2.o.* text data bss dec hex filename 1923 72 16 2011 7db /tmp/cpufreq-nforce2.o.after 1923 72 16 2011 7db /tmp/cpufreq-nforce2.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 44 ++++++++++++++------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index f03e9153..965ea52 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -26,9 +26,10 @@ #define NFORCE2_SAFE_DISTANCE 50 /* Delay in ms between FSB changes */ -//#define NFORCE2_DELAY 10 +/* #define NFORCE2_DELAY 10 */ -/* nforce2_chipset: +/* + * nforce2_chipset: * FSB is changed using the chipset */ static struct pci_dev *nforce2_chipset_dev; @@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev; /* fid: * multiplier * 10 */ -static int fid = 0; +static int fid; /* min_fsb, max_fsb: * minimum and maximum FSB (= FSB at boot time) */ -static int min_fsb = 0; -static int max_fsb = 0; +static int min_fsb; +static int max_fsb; MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); @@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444); MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); + "Minimum FSB to use, if not defined: current FSB - 50"); #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) @@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb) /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL); + 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL); if (!nforce2_sub5) return 0; @@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb) fsb /= 1000000; /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); + pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if(bootfsb || !temp) + if (bootfsb || !temp) return fsb; - + /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp); + pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp); fsb = nforce2_calc_fsb(temp); return fsb; @@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb) } /* First write? Then set actual value */ - pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp); + pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp); if (!temp) { pll = nforce2_calc_pll(tfsb); @@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb) tfsb--; /* Calculate the PLL reg. value */ - if ((pll = nforce2_calc_pll(tfsb)) == -1) + pll = nforce2_calc_pll(tfsb); + if (pll == -1) return -EINVAL; nforce2_write_pll(pll); @@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu) static int nforce2_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { -// unsigned long flags; +/* unsigned long flags; */ struct cpufreq_freqs freqs; unsigned int target_fsb; @@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy, cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); /* Disable IRQs */ - //local_irq_save(flags); + /* local_irq_save(flags); */ if (nforce2_set_fsb(target_fsb) < 0) printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", - target_fsb); + target_fsb); else dprintk("Changed FSB successfully to %d\n", - target_fsb); + target_fsb); /* Enable IRQs */ - //local_irq_restore(flags); + /* local_irq_restore(flags); */ cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); @@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy) policy->max = (fsb_pol_max + 1) * fid * 100; cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); return 0; } @@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy) /* Set maximum FSB to FSB at boot time */ max_fsb = nforce2_fsb_read(1); - if(!max_fsb) + if (!max_fsb) return -EIO; if (!min_fsb) -- cgit v1.1 From b8e0418b2a25f975c3b8764030c24b7253d33a68 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Mon, 16 Jun 2008 19:59:08 -0300 Subject: x86: fix typo CONFIX -> CONFIG Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 326a8f4..fd680c7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -68,7 +68,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIX_X86_64) && defined(CONFIG_X86_MCE) +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; -- cgit v1.1 From dd0c7c4903c29da9aa3bf33deecf064d190a0d81 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Wed, 18 Jun 2008 15:38:57 +0200 Subject: x86: shrink pat_x_mtrr_type to its essentials Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Cc: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ac3a2b1..227df3c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -161,29 +161,21 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ */ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) { - u8 mtrr_type; - - /* - * We return the PAT request directly for types where PAT takes - * precedence with respect to MTRR and for UC_MINUS. - * Consistency checks with other PAT requests is done later - * while going through memtype list. - */ - if (req_type == _PAGE_CACHE_WC || - req_type == _PAGE_CACHE_UC_MINUS || - req_type == _PAGE_CACHE_UC) - return req_type; - /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ - mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_UNCACHABLE) - return _PAGE_CACHE_UC; - if (mtrr_type == MTRR_TYPE_WRCOMB) - return _PAGE_CACHE_WC; - return _PAGE_CACHE_WB; + if (req_type == _PAGE_CACHE_WB) { + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == MTRR_TYPE_UNCACHABLE) + return _PAGE_CACHE_UC; + if (mtrr_type == MTRR_TYPE_WRCOMB) + return _PAGE_CACHE_WC; + } + + return req_type; } /* -- cgit v1.1 From 1a750e0cd7a30c478723ecfa1df685efcdd38a90 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Wed, 18 Jun 2008 21:03:26 -0700 Subject: x86, bitops: make constant-bit set/clear_bit ops faster On Wed, 18 Jun 2008, Linus Torvalds wrote: > > And yes, the "lock andl" should be noticeably faster than the xchgl. I dunno. Here's a untested (!!) patch that turns constant-bit set/clear_bit ops into byte mask ops (lock orb/andb). It's not exactly pretty. The reason for using the byte versions is that a locked op is serialized in the memory pipeline anyway, so there are no forwarding issues (that could slow down things when we access things with different sizes), and the byte ops are a lot smaller than 32-bit and particularly 64-bit ops (big constants, and the 64-bit ops need the REX prefix byte too). [ Side note: I wonder if we should turn the "test_bit()" C version into a "char *" version too.. It could actually help with alias analysis, since char pointers can alias anything. So it might be the RightThing(tm) to do for multiple reasons. I dunno. It's a separate issue. ] It does actually shrink the kernel image a bit (a couple of hundred bytes on the text segment for my everything-compiled-in image), and while it's totally untested the (admittedly few) code generation points I looked at seemed sane. And "lock orb" should be noticeably faster than "lock bts". If somebody wants to play with it, go wild. I didn't do "change_bit()", because nobody sane uses that thing anyway. I guarantee nothing. And if it breaks, nobody saw me do anything. You can't prove this email wasn't sent by somebody who is good at forging smtp. This does require a gcc that is recent enough for "__builtin_constant_p()" to work in an inline function, but I suspect our kernel requirements are already higher than that. And if you do have an old gcc that is supported, the worst that would happen is that the optimization doesn't trigger. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/bitops.h | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h index 7d2494b..ab7635a 100644 --- a/include/asm-x86/bitops.h +++ b/include/asm-x86/bitops.h @@ -23,11 +23,22 @@ #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc versions. */ -#define ADDR "=m" (*(volatile long *) addr) +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) #else -#define ADDR "+m" (*(volatile long *) addr) +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) #endif +#define ADDR BITOP_ADDR(addr) + +/* + * We do the locked ops that don't return the old value as + * a mask operation on a byte. + */ +#define IS_IMMEDIATE(nr) \ + (__builtin_constant_p(nr)) +#define CONST_MASK_ADDR BITOP_ADDR(addr + (nr>>3)) +#define CONST_MASK (1 << (nr & 7)) + /** * set_bit - Atomically set a bit in memory * @nr: the bit to set @@ -43,11 +54,15 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void set_bit(int nr, volatile unsigned long *addr) +static inline void set_bit(unsigned int nr, volatile unsigned long *addr) { - asm volatile(LOCK_PREFIX "bts %1,%0" : ADDR : "Ir" (nr) : "memory"); + if (IS_IMMEDIATE(nr)) + asm volatile(LOCK_PREFIX "orb %1,%0" : CONST_MASK_ADDR : "i" (CONST_MASK) : "memory"); + else + asm volatile(LOCK_PREFIX "bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } + /** * __set_bit - Set a bit in memory * @nr: the bit to set @@ -74,7 +89,10 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) */ static inline void clear_bit(int nr, volatile unsigned long *addr) { - asm volatile(LOCK_PREFIX "btr %1,%0" : ADDR : "Ir" (nr)); + if (IS_IMMEDIATE(nr)) + asm volatile(LOCK_PREFIX "andb %1,%0" : CONST_MASK_ADDR : "i" (~CONST_MASK)); + else + asm volatile(LOCK_PREFIX "btr %1,%0" : ADDR : "Ir" (nr)); } /* -- cgit v1.1 From 5f0120b5786f5dbe097a946a2eb5d745ebc2b7ed Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Wed, 18 Jun 2008 12:42:11 +0100 Subject: x86-64: remove unnecessary ptregs call stubs Signed-off-by: Jan Beulich <jbeulich@novell.com> Cc: "Andi Kleen" <andi@firstfloor.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/ia32/ia32entry.S | 6 ++---- arch/x86/kernel/entry_64.S | 1 - include/asm-x86/unistd_64.h | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b5e329d..3aefbce 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -370,13 +370,11 @@ quiet_ni_syscall: PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx PTREGSCALL stub32_execve, sys32_execve, %rcx PTREGSCALL stub32_fork, sys_fork, %rdi PTREGSCALL stub32_clone, sys32_clone, %rdx PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL stub32_iopl, sys_iopl, %rsi - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx ENTRY(ia32_ptregs_common) popq %r11 @@ -476,7 +474,7 @@ ia32_sys_call_table: .quad sys_ssetmask .quad sys_setreuid16 /* 70 */ .quad sys_setregid16 - .quad stub32_sigsuspend + .quad sys32_sigsuspend .quad compat_sys_sigpending .quad sys_sethostname .quad compat_sys_setrlimit /* 75 */ @@ -583,7 +581,7 @@ ia32_sys_call_table: .quad sys32_rt_sigpending .quad compat_sys_rt_sigtimedwait .quad sys32_rt_sigqueueinfo - .quad stub32_rt_sigsuspend + .quad sys_rt_sigsuspend .quad sys32_pread /* 180 */ .quad sys32_pwrite .quad sys_chown16 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1f1751d..5cf0aa9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -420,7 +420,6 @@ END(\label) PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx PTREGSCALL stub_iopl, sys_iopl, %rsi diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h index fe26e36..9c1a4a3 100644 --- a/include/asm-x86/unistd_64.h +++ b/include/asm-x86/unistd_64.h @@ -290,7 +290,7 @@ __SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) #define __NR_rt_sigqueueinfo 129 __SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) #define __NR_rt_sigsuspend 130 -__SYSCALL(__NR_rt_sigsuspend, stub_rt_sigsuspend) +__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) #define __NR_sigaltstack 131 __SYSCALL(__NR_sigaltstack, stub_sigaltstack) #define __NR_utime 132 -- cgit v1.1 From 7dbceaf9bb68919651901b101f44edd5391ee489 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 20 Jun 2008 07:28:24 +0200 Subject: x86, bitops: make constant-bit set/clear_bit ops faster, adapt, clean up fix integration bug introduced by "x86: bitops take an unsigned long *" which turned "(void *) + x" into "(long *) + x". small cleanups to make it more apparent which value get propagated where. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/bitops.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h index ab7635a..6c50548 100644 --- a/include/asm-x86/bitops.h +++ b/include/asm-x86/bitops.h @@ -28,16 +28,15 @@ #define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) #endif -#define ADDR BITOP_ADDR(addr) +#define ADDR BITOP_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. */ -#define IS_IMMEDIATE(nr) \ - (__builtin_constant_p(nr)) -#define CONST_MASK_ADDR BITOP_ADDR(addr + (nr>>3)) -#define CONST_MASK (1 << (nr & 7)) +#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) +#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK(nr) (1 << ((nr) & 7)) /** * set_bit - Atomically set a bit in memory @@ -56,13 +55,17 @@ */ static inline void set_bit(unsigned int nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) - asm volatile(LOCK_PREFIX "orb %1,%0" : CONST_MASK_ADDR : "i" (CONST_MASK) : "memory"); - else - asm volatile(LOCK_PREFIX "bts %1,%0" : ADDR : "Ir" (nr) : "memory"); + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "orb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "i" (CONST_MASK(nr)) + : "memory"); + } else { + asm volatile(LOCK_PREFIX "bts %1,%0" + : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + } } - /** * __set_bit - Set a bit in memory * @nr: the bit to set @@ -89,10 +92,15 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) */ static inline void clear_bit(int nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) - asm volatile(LOCK_PREFIX "andb %1,%0" : CONST_MASK_ADDR : "i" (~CONST_MASK)); - else - asm volatile(LOCK_PREFIX "btr %1,%0" : ADDR : "Ir" (nr)); + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "andb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "i" (~CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btr %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } } /* -- cgit v1.1 From 6673cf63e5d973db5145d1f48b354efcb9fe2a13 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata <yamahata@valinux.co.jp> Date: Mon, 16 Jun 2008 14:58:13 -0700 Subject: xen: Use wmb instead of rmb in xen_evtchn_do_upcall(). This patch is ported one from 534:77db69c38249 of linux-2.6.18-xen.hg. Use wmb instead of rmb to enforce ordering between evtchn_upcall_pending and evtchn_pending_sel stores in xen_evtchn_do_upcall(). Cc: Samuel Thibault <samuel.thibault@eu.citrix.com> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 73d78dc..332dd637 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ - rmb(); + wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); while (pending_words != 0) { -- cgit v1.1 From eb179e443deb0a5c81a62b4c157124a4b7ff1813 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 15:01:53 -0700 Subject: xen: mask unwanted pte bits in __supported_pte_mask [ Stable: this isn't a bugfix in itself, but it's a pre-requiste for "xen: don't drop NX bit" ] Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stable Kernel <stable@kernel.org> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8e6152e..73fb0c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1280,6 +1280,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7c99358..f0078d9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -343,10 +343,8 @@ pgdval_t xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { + if (pte & _PAGE_PRESENT) pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } return (pte_t){ .pte = pte }; } -- cgit v1.1 From a987b16cc6123af2c9414032701bab5f73c54c89 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 15:01:56 -0700 Subject: xen: don't drop NX bit Because NX is now enforced properly, we must put the hypercall page into the .text segment so that it is executable. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stable Kernel <stable@kernel.org> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/mmu.c | 54 +++++++++++++++++++++++++++---------------------- arch/x86/xen/xen-head.S | 2 +- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f0078d9..8132aa8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -323,46 +323,54 @@ out: preempt_enable(); } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) { - pteval_t ret = pte.pte; + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return val; +} - return ret; +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +pteval_t xen_pte_val(pte_t pte) +{ + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } void xen_set_pud_hyper(pud_t *ptr, pud_t val) @@ -421,9 +429,7 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index ef6c9e0..7c0cf63 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -18,7 +18,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 0x1000 -- cgit v1.1 From 944256e00a5466ae6b7a11fdb3a47d092f2f62c1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 17 Jun 2008 11:41:49 -0700 Subject: x86: unify asm-x86/fixmap*.h Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/fixmap.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/fixmap_32.h | 42 ------------------------------------------ include/asm-x86/fixmap_64.h | 37 ------------------------------------- 3 files changed, 43 insertions(+), 79 deletions(-) diff --git a/include/asm-x86/fixmap.h b/include/asm-x86/fixmap.h index 5bd2069..01b0f98 100644 --- a/include/asm-x86/fixmap.h +++ b/include/asm-x86/fixmap.h @@ -7,7 +7,50 @@ # include "fixmap_64.h" #endif +extern void __set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + #define clear_fixmap(idx) \ __set_fixmap(idx, 0, __pgprot(0)) +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} #endif diff --git a/include/asm-x86/fixmap_32.h b/include/asm-x86/fixmap_32.h index 4b96148..3a94115 100644 --- a/include/asm-x86/fixmap_32.h +++ b/include/asm-x86/fixmap_32.h @@ -109,17 +109,8 @@ enum fixed_addresses { __end_of_fixed_addresses }; -extern void __set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); extern void reserve_top_address(unsigned long reserve); -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) @@ -128,38 +119,5 @@ extern void reserve_top_address(unsigned long reserve); #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) #define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without tranlation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static __always_inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. (such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - -static inline unsigned long virt_to_fix(const unsigned long vaddr) -{ - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); - return __virt_to_fix(vaddr); -} - #endif /* !__ASSEMBLY__ */ #endif diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index 355d26a..6260988 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -52,17 +52,6 @@ enum fixed_addresses { __end_of_fixed_addresses }; -extern void __set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - #define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) @@ -71,30 +60,4 @@ extern void __set_fixmap(enum fixed_addresses idx, #define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) #define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without translation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static __always_inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. (such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - #endif -- cgit v1.1 From 7c7e6e07e2a7c0d2d96389f4f0540e44a80ecdaa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 17 Jun 2008 11:41:54 -0700 Subject: x86: unify __set_fixmap In both cases, I went with the 32-bit behaviour. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 12 ------------ arch/x86/mm/pgtable.c | 14 ++++++++++++++ arch/x86/mm/pgtable_32.c | 14 +------------- include/asm-x86/fixmap.h | 2 ++ 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7..e5f5319 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -213,18 +213,6 @@ void __init cleanup_highmap(void) } } -/* NOTE: this is meant to be run only at boot */ -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk(KERN_ERR "Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, prot); -} - static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5015976..3ebebe4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -274,3 +274,17 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } + +int fixmaps_set; + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + fixmaps_set++; +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 369cf06..3f97c3c 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -145,18 +145,6 @@ static int fixmaps; unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - BUG(); - return; - } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); - fixmaps++; -} - /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve @@ -166,7 +154,7 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) */ void reserve_top_address(unsigned long reserve) { - BUG_ON(fixmaps > 0); + BUG_ON(fixmaps_set > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", (int)-reserve); __FIXADDR_TOP = -reserve - PAGE_SIZE; diff --git a/include/asm-x86/fixmap.h b/include/asm-x86/fixmap.h index 01b0f98..934d6b4 100644 --- a/include/asm-x86/fixmap.h +++ b/include/asm-x86/fixmap.h @@ -7,6 +7,8 @@ # include "fixmap_64.h" #endif +extern int fixmaps_set; + extern void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags); #define set_fixmap(idx, phys) \ -- cgit v1.1 From d494a96125c99f1e37b1f831b29b42c9b712ee05 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 17 Jun 2008 11:41:59 -0700 Subject: x86: implement set_pte_vaddr Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 9 ++++----- arch/x86/mm/pgtable.c | 2 +- arch/x86/mm/pgtable_32.c | 6 +++--- include/asm-x86/pgtable.h | 3 +++ 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e5f5319..74fae83 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,15 +135,15 @@ static __init void *spp_getpage(void) return ptr; } -static void -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) +void +set_pte_vaddr(unsigned long vaddr, pte_t new_pte) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, new_pte; + pte_t *pte; - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte)); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { @@ -170,7 +170,6 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) return; } } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); if (!pte_none(*pte) && pte_val(new_pte) && diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3ebebe4..7498124 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -285,6 +285,6 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + set_pte_vaddr(address, pfn_pte(phys >> PAGE_SHIFT, flags)); fixmaps_set++; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 3f97c3c..0662f34 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -71,7 +71,7 @@ void show_mem(void) * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud; @@ -94,8 +94,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) return; } pte = pte_offset_kernel(pmd, vaddr); - if (pgprot_val(flags)) - set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); + if (pte_val(pteval)) + set_pte_present(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b..702f269 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -318,6 +318,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #endif +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ -- cgit v1.1 From aeaaa59c7e15dcfaaf57ce069ef81683067d575d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 17 Jun 2008 11:42:01 -0700 Subject: x86/paravirt/xen: add set_fixmap pv_mmu_ops Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Juan Quintela <quintela@redhat.com> Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/paravirt.c | 2 ++ arch/x86/mm/pgtable.c | 9 +++++++-- arch/x86/xen/enlighten.c | 29 +++++++++++++++++++++++++++++ include/asm-x86/fixmap.h | 14 ++++++++++++-- include/asm-x86/paravirt.h | 13 +++++++++++++ 5 files changed, 63 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 74f0c5e..cf06670 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -416,6 +416,8 @@ struct pv_mmu_ops pv_mmu_ops = { .enter = paravirt_nop, .leave = paravirt_nop, }, + + .set_fixmap = native_set_fixmap, }; EXPORT_SYMBOL_GPL(pv_time_ops); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7498124..e9fb663 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -277,7 +277,7 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, int fixmaps_set; -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); @@ -285,6 +285,11 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) BUG(); return; } - set_pte_vaddr(address, pfn_pte(phys >> PAGE_SHIFT, flags)); + set_pte_vaddr(address, pte); fixmaps_set++; } + +void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e4..0ad8a64 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -960,6 +960,33 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } +static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif + case FIX_WP_TEST: + case FIX_VDSO: +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ +#endif + pte = pfn_pte(phys, prot); + break; + + default: + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); +} + static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -1112,6 +1139,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .enter = paravirt_enter_lazy_mmu, .leave = xen_leave_lazy, }, + + .set_fixmap = xen_set_fixmap, }; #ifdef CONFIG_SMP diff --git a/include/asm-x86/fixmap.h b/include/asm-x86/fixmap.h index 934d6b4..44d4f82 100644 --- a/include/asm-x86/fixmap.h +++ b/include/asm-x86/fixmap.h @@ -9,8 +9,18 @@ extern int fixmaps_set; -extern void __set_fixmap(enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); +void native_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); + +#ifndef CONFIG_PARAVIRT +static inline void __set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + native_set_fixmap(idx, phys, flags); +} +#endif + #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 0f13b94..af85caf 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -273,6 +273,13 @@ struct pv_mmu_ops { #endif struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + unsigned long phys, pgprot_t flags); }; /* This contains all the paravirt structures: we get a convenient @@ -1252,6 +1259,12 @@ static inline void arch_flush_lazy_mmu_mode(void) } } +static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, + unsigned long phys, pgprot_t flags) +{ + pv_mmu_ops.set_fixmap(idx, phys, flags); +} + void _paravirt_nop(void); #define paravirt_nop ((void *)_paravirt_nop) -- cgit v1.1 From a1d5a8691f1b6c92491747bea3b778b184fa5837 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 20 Jun 2008 15:34:46 +0200 Subject: x86: unify __set_fixmap, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix build failure: arch/x86/mm/pgtable.c:280: warning: ‘enum fixed_addresses’ declared inside parameter list arch/x86/mm/pgtable.c:280: warning: its scope is only this definition or declaration, which is probably not what you want arch/x86/mm/pgtable.c:280: error: parameter 1 (‘idx’) has incomplete type Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pgtable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e9fb663..892fd18 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -2,6 +2,7 @@ #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> +#include <asm/fixmap.h> pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { -- cgit v1.1 From 437a0a54eea7b101e8a5b70688009956f6522ed0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 20 Jun 2008 21:50:20 +0200 Subject: x86, bitops: make constant-bit set/clear_bit ops faster, gcc workaround Jeremy Fitzhardinge reported this compiler bug: Suggestion from Linus: add "r" to the input constraint of the set_bit()/clear_bit()'s constant 'nr' branch: Blows up on "gcc version 3.4.4 20050314 (prerelease) (Debian 3.4.3-13)": CC init/main.o include2/asm/bitops.h: In function `start_kernel': include2/asm/bitops.h:59: warning: asm operand 1 probably doesn't match constraints include2/asm/bitops.h:59: warning: asm operand 1 probably doesn't match constraints include2/asm/bitops.h:59: warning: asm operand 1 probably doesn't match constraints include2/asm/bitops.h:59: error: impossible constraint in `asm' include2/asm/bitops.h:59: error: impossible constraint in `asm' include2/asm/bitops.h:59: error: impossible constraint in `asm' Reported-by: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/bitops.h b/include/asm-x86/bitops.h index 6c50548..96b1829 100644 --- a/include/asm-x86/bitops.h +++ b/include/asm-x86/bitops.h @@ -58,7 +58,7 @@ static inline void set_bit(unsigned int nr, volatile unsigned long *addr) if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" : CONST_MASK_ADDR(nr, addr) - : "i" (CONST_MASK(nr)) + : "iq" ((u8)CONST_MASK(nr)) : "memory"); } else { asm volatile(LOCK_PREFIX "bts %1,%0" @@ -95,7 +95,7 @@ static inline void clear_bit(int nr, volatile unsigned long *addr) if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" : CONST_MASK_ADDR(nr, addr) - : "i" (~CONST_MASK(nr))); + : "iq" ((u8)~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX "btr %1,%0" : BITOP_ADDR(addr) -- cgit v1.1 From 0754557d72c1fbfc5fcfd5235e7c23ae6f77248c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 03:50:47 -0700 Subject: x86: change early_gart_iommu_check() back to any_mapped Kevin Winchester reported a GART related direct rendering failure against linux-next-20080611, which shows up via these log entries: PCI: Using ACPI for IRQ routing PCI: Cannot allocate resource region 0 of device 0000:00:00.0 agpgart: Detected AGP bridge 0 agpgart: Aperture conflicts with PCI mapping. agpgart: Aperture from AGP @ e0000000 size 128 MB agpgart: Aperture conflicts with PCI mapping. agpgart: No usable aperture found. agpgart: Consider rebooting with iommu=memaper=2 to get a good aperture. instead of the expected: PCI: Using ACPI for IRQ routing agpgart: Detected AGP bridge 0 agpgart: Aperture from AGP @ e0000000 size 128 MB Kevin bisected it down to this change in tip/x86/gart: "x86: checking aperture size order". agp check is using request_mem_region(), and could fail if e820 is reserved... change it back to e820_any_mapped(). Reported-and-bisected-by: "Kevin Winchester" <kjwinchester@gmail.com> Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Tested-by: Kevin Winchester <kjwinchester@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3409abb..e819362 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -324,8 +324,8 @@ void __init early_gart_iommu_check(void) fix = 1; if (gart_fix_e820 && !fix && aper_enabled) { - if (!e820_all_mapped(aper_base, aper_base + aper_size, - E820_RESERVED)) { + if (e820_any_mapped(aper_base, aper_base + aper_size, + E820_RAM)) { /* reserve it, so we can reuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); add_memory_region(aper_base, aper_size, E820_RESERVED); -- cgit v1.1 From 3da757daf86e498872855f0b5e101f763ba79499 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Fri, 20 Jun 2008 15:06:33 -0700 Subject: x86: use cpu_khz for loops_per_jiffy calculation On the x86 platform we can use the value of tsc_khz computed during tsc calibration to calculate the loops_per_jiffy value. Its very important to keep the error in lpj values to minimum as any error in that may result in kernel panic in check_timer. In virtualization environment, On a highly overloaded host the guest delay calibration may sometimes result in errors beyond the ~50% that timer_irq_works can handle, resulting in the guest panicking. Does some formating changes to lpj_setup code to now have a single printk to print the bogomips value. We do this only for the boot processor because the AP's can have different base frequencies or the BIOS might boot a AP at a different frequency. Signed-off-by: Alok N Kataria <akataria@vmware.com> Cc: Arjan van de Ven <arjan@infradead.org> Cc: Daniel Hecht <dhecht@vmware.com> Cc: Tim Mann <mann@vmware.com> Cc: Zach Amsden <zach@vmware.com> Cc: Sahil Rihan <srihan@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/time_64.c | 2 ++ arch/x86/kernel/tsc_32.c | 5 +++++ include/linux/delay.h | 1 + init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index c737849..12b4a71 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,6 +123,8 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); + lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759d..be72903 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -401,6 +401,7 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { int cpu; + u64 lpj; if (!cpu_has_tsc || tsc_disabled) { /* Disable the TSC in case of !cpu_has_tsc */ @@ -421,6 +422,10 @@ void __init tsc_init(void) return; } + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_tsc = lpj; + printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); diff --git a/include/linux/delay.h b/include/linux/delay.h index 54552d2..01aec60 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,6 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif +extern unsigned long lpj_tsc; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index ecb3822..8628697 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -8,7 +8,9 @@ #include <linux/delay.h> #include <linux/init.h> #include <linux/timex.h> +#include <linux/smp.h> +unsigned long lpj_tsc; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -108,6 +110,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the tsc frequency. + * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -118,20 +124,20 @@ void __cpuinit calibrate_delay(void) if (preset_lpj) { loops_per_jiffy = preset_lpj; - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); + printk(KERN_INFO + "Calibrating delay loop (skipped) preset value.. "); + } else if ((smp_processor_id() == 0) && lpj_tsc) { + loops_per_jiffy = lpj_tsc; + printk(KERN_INFO + "Calibrating delay loop (skipped), " + "using tsc calculated value.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. "); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); + printk(KERN_INFO + "Calibrating delay using timer specific routine.. "); } else { loops_per_jiffy = (1<<12); - printk(KERN_DEBUG "Calibrating delay loop... "); + printk(KERN_INFO "Calibrating delay loop... "); while ((loops_per_jiffy <<= 1) != 0) { /* wait for "start of" clock tick */ ticks = jiffies; @@ -161,12 +167,8 @@ void __cpuinit calibrate_delay(void) if (jiffies != ticks) /* longer than 1 tick */ loops_per_jiffy &= ~loopbit; } - - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); } - + printk(KERN_INFO "%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); } -- cgit v1.1 From 6ff10de374cc68ff2024247793176dc8a1b317ea Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 24 Jun 2008 01:19:49 +0200 Subject: x86: fix "x86: use cpu_khz for loops_per_jiffy calculation" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/tsc_32.c: In function ‘tsc_init': arch/x86/kernel/tsc_32.c:421: error: ‘lpj_tsc' undeclared (first use in this function) arch/x86/kernel/tsc_32.c:421: error: (Each undeclared identifier is reported only once arch/x86/kernel/tsc_32.c:421: error: for each function it appears in.) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tsc_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index be72903..0af49fb 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -1,6 +1,7 @@ #include <linux/sched.h> #include <linux/clocksource.h> #include <linux/workqueue.h> +#include <linux/delay.h> #include <linux/cpufreq.h> #include <linux/jiffies.h> #include <linux/init.h> -- cgit v1.1 From bcc643dc287cb732e96a1685ac130c3ae8b1c960 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 21:58:46 +0200 Subject: x86: introduce macro to check whether an address range is in the ISA range Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/pat.c | 5 ++--- include/asm-x86/e820.h | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7452eb3..6d9960d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -142,7 +142,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't remap the low PCI/ISA area, it's always mapped.. */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 227df3c..5bbc22e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -215,7 +215,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { + if (is_ISA_range(start, end - 1)) { if (ret_type) *ret_type = _PAGE_CACHE_WB; @@ -415,9 +415,8 @@ int free_memtype(u64 start, u64 end) } /* Low ISA region is always mapped WB. No need to track */ - if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { + if (is_ISA_range(start, end - 1)) return 0; - } spin_lock(&memtype_lock); list_for_each_entry(ml, &memtype_list, nd) { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 7004251..5103d0b 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -24,6 +24,7 @@ struct e820map { #define ISA_START_ADDRESS 0xa0000 #define ISA_END_ADDRESS 0x100000 +#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) #define BIOS_BEGIN 0x000a0000 #define BIOS_END 0x00100000 -- cgit v1.1 From ac97991ec9e0a05c8860f4a04f8477227b1c03f2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:01:49 +0200 Subject: x86: pat.c choose more crisp variable names - parse/ml => entry (within list_for_each and friends) - new_entry => new - ret_type => new_type (to avoid confusion with req_type) (... to make it more readable...) Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 127 ++++++++++++++++++++++++++---------------------------- 1 file changed, 62 insertions(+), 65 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 5bbc22e..a6507bf 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -188,37 +188,34 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) * req_type will have a special case value '-1', when requester want to inherit * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. * - * If ret_type is NULL, function will return an error if it cannot reserve the - * region with req_type. If ret_type is non-null, function will return - * available type in ret_type in case of no error. In case of any error + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *ret_type) + unsigned long *new_type) { - struct memtype *new_entry = NULL; - struct memtype *parse; + struct memtype *new, *entry; unsigned long actual_type; int err = 0; /* Only track when pat_enabled */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ - if (ret_type) { - if (req_type == -1) { - *ret_type = _PAGE_CACHE_WB; - } else { - *ret_type = req_type & _PAGE_CACHE_MASK; - } + if (new_type) { + if (req_type == -1) + *new_type = _PAGE_CACHE_WB; + else + *new_type = req_type & _PAGE_CACHE_MASK; } return 0; } /* Low ISA region is always mapped WB in page table. No need to track */ if (is_ISA_range(start, end - 1)) { - if (ret_type) - *ret_type = _PAGE_CACHE_WB; - + if (new_type) + *new_type = _PAGE_CACHE_WB; return 0; } @@ -243,65 +240,65 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = pat_x_mtrr_type(start, end, req_type); } - new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); - if (!new_entry) + new = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new) return -ENOMEM; - new_entry->start = start; - new_entry->end = end; - new_entry->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; - if (ret_type) - *ret_type = actual_type; + if (new_type) + *new_type = actual_type; spin_lock(&memtype_lock); /* Search for existing mapping that overlaps the current range */ - list_for_each_entry(parse, &memtype_list, nd) { + list_for_each_entry(entry, &memtype_list, nd) { struct memtype *saved_ptr; - if (parse->start >= end) { + if (entry->start >= end) { dprintk("New Entry\n"); - list_add(&new_entry->nd, parse->nd.prev); - new_entry = NULL; + list_add(&new->nd, entry->nd.prev); + new = NULL; break; } - if (start <= parse->start && end >= parse->start) { - if (actual_type != parse->type && ret_type) { - actual_type = parse->type; - *ret_type = actual_type; - new_entry->type = actual_type; + if (start <= entry->start && end >= entry->start) { + if (actual_type != entry->type && new_type) { + actual_type = entry->type; + *new_type = actual_type; + new->type = actual_type; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } - saved_ptr = parse; + saved_ptr = entry; /* * Check to see whether the request overlaps more * than one entry in the list */ - list_for_each_entry_continue(parse, &memtype_list, nd) { - if (end <= parse->start) { + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (end <= entry->start) { break; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } @@ -314,46 +311,46 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ - list_add(&new_entry->nd, saved_ptr->nd.prev); - new_entry = NULL; + list_add(&new->nd, saved_ptr->nd.prev); + new = NULL; break; } - if (start < parse->end) { - if (actual_type != parse->type && ret_type) { - actual_type = parse->type; - *ret_type = actual_type; - new_entry->type = actual_type; + if (start < entry->end) { + if (actual_type != entry->type && new_type) { + actual_type = entry->type; + *new_type = actual_type; + new->type = actual_type; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } - saved_ptr = parse; + saved_ptr = entry; /* * Check to see whether the request overlaps more * than one entry in the list */ - list_for_each_entry_continue(parse, &memtype_list, nd) { - if (end <= parse->start) { + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (end <= entry->start) { break; } - if (actual_type != parse->type) { + if (actual_type != entry->type) { printk( KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", current->comm, current->pid, start, end, cattr_name(actual_type), - cattr_name(parse->type)); + cattr_name(entry->type)); err = -EBUSY; break; } @@ -366,8 +363,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ - list_add(&new_entry->nd, &saved_ptr->nd); - new_entry = NULL; + list_add(&new->nd, &saved_ptr->nd); + new = NULL; break; } } @@ -375,24 +372,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (err) { printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(new_entry->type), + start, end, cattr_name(new->type), cattr_name(req_type)); - kfree(new_entry); + kfree(new); spin_unlock(&memtype_lock); return err; } - if (new_entry) { + if (new) { /* No conflict. Not yet added to the list. Add to the tail */ - list_add_tail(&new_entry->nd, &memtype_list); + list_add_tail(&new->nd, &memtype_list); dprintk("New Entry\n"); } - if (ret_type) { + if (new_type) { dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), - cattr_name(req_type), cattr_name(*ret_type)); + cattr_name(req_type), cattr_name(*new_type)); } else { dprintk( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", @@ -406,7 +403,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *ml; + struct memtype *entry; int err = -EINVAL; /* Only track when pat_enabled */ @@ -419,10 +416,10 @@ int free_memtype(u64 start, u64 end) return 0; spin_lock(&memtype_lock); - list_for_each_entry(ml, &memtype_list, nd) { - if (ml->start == start && ml->end == end) { - list_del(&ml->nd); - kfree(ml); + list_for_each_entry(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + list_del(&entry->nd); + kfree(entry); err = 0; break; } -- cgit v1.1 From 69e26be9b1d0c83d3581475095ce2a1ccc578215 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:03:06 +0200 Subject: x86: pat.c more trivial changes - add BUG statement to catch invalid start and end parameters - No need to track the actual type in both req_type and actual_type -- keep req_type unchanged. - removed (IMHO) superfluous comments Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index a6507bf..c996a36 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -200,7 +200,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long actual_type; int err = 0; - /* Only track when pat_enabled */ + BUG_ON(start >= end); /* end is exclusive */ + if (!pat_enabled) { /* This is identical to page table setting without PAT */ if (new_type) { @@ -228,17 +229,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == MTRR_TYPE_WRBACK) { - req_type = _PAGE_CACHE_WB; + if (mtrr_type == MTRR_TYPE_WRBACK) actual_type = _PAGE_CACHE_WB; - } else { - req_type = _PAGE_CACHE_UC_MINUS; + else actual_type = _PAGE_CACHE_UC_MINUS; - } - } else { - req_type &= _PAGE_CACHE_MASK; - actual_type = pat_x_mtrr_type(start, end, req_type); - } + } else + actual_type = pat_x_mtrr_type(start, end, + req_type & _PAGE_CACHE_MASK); new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -406,10 +403,8 @@ int free_memtype(u64 start, u64 end) struct memtype *entry; int err = -EINVAL; - /* Only track when pat_enabled */ - if (!pat_enabled) { + if (!pat_enabled) return 0; - } /* Low ISA region is always mapped WB. No need to track */ if (is_ISA_range(start, end - 1)) -- cgit v1.1 From 3e9c83b309fd7cbf1d9b801d0d5877c040e30420 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:04:02 +0200 Subject: x86: pat.c consolidate error/debug messages in reserve_memtype ... and move last debug message out of locked section. Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 51 +++++++++++---------------------------------------- 1 file changed, 11 insertions(+), 40 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c996a36..1118288 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -269,12 +269,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -290,12 +284,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -321,12 +309,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -342,12 +324,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (actual_type != entry->type) { - printk( - KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, - start, end, - cattr_name(actual_type), - cattr_name(entry->type)); err = -EBUSY; break; } @@ -367,10 +343,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk(KERN_INFO - "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(new->type), - cattr_name(req_type)); + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, + end, cattr_name(new->type), cattr_name(entry->type)); + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " + "track %s, req %s\n", + start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); return err; @@ -382,19 +360,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("New Entry\n"); } - if (new_type) { - dprintk( - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(actual_type), - cattr_name(req_type), cattr_name(*new_type)); - } else { - dprintk( - "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", - start, end, cattr_name(actual_type), - cattr_name(req_type)); - } - spin_unlock(&memtype_lock); + + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(new->type), cattr_name(req_type), + new_type ? cattr_name(*new_type) : "-"); + return err; } -- cgit v1.1 From f6887264deba4cd991f0ca006918dcff4c939021 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:05:37 +0200 Subject: x86: pat.c consolidate list_add handling in reserve_memtype Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1118288..49dcd96 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -198,6 +198,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, { struct memtype *new, *entry; unsigned long actual_type; + struct list_head *where; int err = 0; BUG_ON(start >= end); /* end is exclusive */ @@ -251,13 +252,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); /* Search for existing mapping that overlaps the current range */ + where = NULL; list_for_each_entry(entry, &memtype_list, nd) { struct memtype *saved_ptr; if (entry->start >= end) { - dprintk("New Entry\n"); - list_add(&new->nd, entry->nd.prev); - new = NULL; + where = entry->nd.prev; break; } @@ -295,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); - /* No conflict. Go ahead and add this new entry */ - list_add(&new->nd, saved_ptr->nd.prev); - new = NULL; + where = saved_ptr->nd.prev; break; } @@ -335,9 +333,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); - /* No conflict. Go ahead and add this new entry */ - list_add(&new->nd, &saved_ptr->nd); - new = NULL; + where = &saved_ptr->nd; break; } } @@ -354,11 +350,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - if (new) { - /* No conflict. Not yet added to the list. Add to the tail */ + if (where) + list_add(&new->nd, where); + else list_add_tail(&new->nd, &memtype_list); - dprintk("New Entry\n"); - } spin_unlock(&memtype_lock); -- cgit v1.1 From 64fe44c38bbdfab4fe052029058ce5fe9804de68 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:07:09 +0200 Subject: x86: pat.c introduce function to check for conflicts with existing memtypes ... to strip down loop body in reserve_memtype. Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 96 +++++++++++++++++++------------------------------------ 1 file changed, 33 insertions(+), 63 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 49dcd96..281ac64 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -178,6 +178,33 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } +static int chk_conflict(struct memtype *new, struct memtype *entry, + unsigned long *type) +{ + if (new->type != entry->type) { + if (type) { + new->type = entry->type; + *type = entry->type; + } else + goto conflict; + } + + /* check overlaps with more than one entry in the list */ + list_for_each_entry_continue(entry, &memtype_list, nd) { + if (new->end <= entry->start) + break; + else if (new->type != entry->type) + goto conflict; + } + return 0; + + conflict: + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, + new->end, cattr_name(new->type), cattr_name(entry->type)); + return -EBUSY; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -254,94 +281,37 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry(entry, &memtype_list, nd) { - struct memtype *saved_ptr; - if (entry->start >= end) { where = entry->nd.prev; break; } if (start <= entry->start && end >= entry->start) { - if (actual_type != entry->type && new_type) { - actual_type = entry->type; - *new_type = actual_type; - new->type = actual_type; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - - saved_ptr = entry; - /* - * Check to see whether the request overlaps more - * than one entry in the list - */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - break; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - } - + err = chk_conflict(new, entry, new_type); if (err) { break; } dprintk("Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); - where = saved_ptr->nd.prev; + entry->start, entry->end); + where = entry->nd.prev; break; } if (start < entry->end) { - if (actual_type != entry->type && new_type) { - actual_type = entry->type; - *new_type = actual_type; - new->type = actual_type; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - - saved_ptr = entry; - /* - * Check to see whether the request overlaps more - * than one entry in the list - */ - list_for_each_entry_continue(entry, &memtype_list, nd) { - if (end <= entry->start) { - break; - } - - if (actual_type != entry->type) { - err = -EBUSY; - break; - } - } - + err = chk_conflict(new, entry, new_type); if (err) { break; } dprintk("Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); - where = &saved_ptr->nd; + entry->start, entry->end); + where = &entry->nd; break; } } if (err) { - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(new->type), cattr_name(entry->type)); printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " "track %s, req %s\n", start, end, cattr_name(new->type), cattr_name(req_type)); -- cgit v1.1 From 33af9039cbf629041da2bfa0cf451208391a1ec3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann <andreas.herrmann3@amd.com> Date: Fri, 20 Jun 2008 22:08:37 +0200 Subject: x86: pat.c final cleanup of loop body in reserve_memtype Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Suresh B Siddha <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pat.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 281ac64..a885a10 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -281,32 +281,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry(entry, &memtype_list, nd) { - if (entry->start >= end) { + if (end <= entry->start) { where = entry->nd.prev; break; - } - - if (start <= entry->start && end >= entry->start) { + } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); - if (err) { - break; + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + where = entry->nd.prev; } - - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = entry->nd.prev; break; - } - - if (start < entry->end) { + } else if (start < entry->end) { /* start > entry->start */ err = chk_conflict(new, entry, new_type); - if (err) { - break; + if (!err) { + dprintk("Overlap at 0x%Lx-0x%Lx\n", + entry->start, entry->end); + where = &entry->nd; } - - dprintk("Overlap at 0x%Lx-0x%Lx\n", - entry->start, entry->end); - where = &entry->nd; break; } } -- cgit v1.1 From 93e1ade5382206d597e9d6de2d1383e69f54d064 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Sun, 22 Jun 2008 09:40:18 +0200 Subject: x86/oprofile: disable preemption in nmi_shutdown fix: BUG: using smp_processor_id() in preemptible [00000000] code: oprofiled/27301 caller is nmi_shutdown+0x11/0x60 Pid: 27301, comm: oprofiled Not tainted 2.6.26-rc7 #25 [<c028a90d>] debug_smp_processor_id+0xbd/0xc0 [<c045fba1>] nmi_shutdown+0x11/0x60 [<c045dd4a>] oprofile_shutdown+0x2a/0x60 Note that we don't need this for the other functions, since they are all called with on_each_cpu() (which disables preemption for us anyway). Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Cc: Philippe Elie <phil.el@wanadoo.fr> Cc: oprofile-list@lists.sf.net Cc: Johannes Weiner <hannes@saeurebad.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/oprofile/nmi_int.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cc48d3f..2b6ad5b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -269,12 +269,13 @@ static void nmi_cpu_shutdown(void *dummy) static void nmi_shutdown(void) { - struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs *msrs = &get_cpu_var(cpu_msrs); nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); unregister_die_notifier(&profile_exceptions_nb); model->shutdown(msrs); free_msrs(); + put_cpu_var(cpu_msrs); } static void nmi_cpu_start(void *dummy) -- cgit v1.1 From f3f3149f35b9195ef4b761b1353fc0766b5f53be Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Mon, 23 Jun 2008 18:21:56 -0700 Subject: x86: use cpu_khz for loops_per_jiffy calculation, cleanup As suggested by Ingo, remove all references to tsc from init/calibrate.c TSC is x86 specific, and using tsc in variable names in a generic file should be avoided. lpj_tsc is now called lpj_fine, since it is related to fine tuning of lpj value. Also tsc_rate_* is called timer_rate_* Signed-off-by: Alok N Kataria <akataria@vmware.com> Cc: Arjan van de Ven <arjan@infradead.org> Cc: Daniel Hecht <dhecht@vmware.com> Cc: Tim Mann <mann@vmware.com> Cc: Zach Amsden <zach@vmware.com> Cc: Sahil Rihan <srihan@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/time_64.c | 2 +- arch/x86/kernel/tsc_32.c | 2 +- include/linux/delay.h | 2 +- init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 12b4a71..39ae851 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,7 +123,7 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); - lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 0af49fb..048baab 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -425,7 +425,7 @@ void __init tsc_init(void) lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); - lpj_tsc = lpj; + lpj_fine = lpj; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/include/linux/delay.h b/include/linux/delay.h index 01aec60..fd832c6 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,7 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif -extern unsigned long lpj_tsc; +extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index 8628697..7963e3f 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -10,7 +10,7 @@ #include <linux/timex.h> #include <linux/smp.h> -unsigned long lpj_tsc; +unsigned long lpj_fine; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -35,9 +35,9 @@ static unsigned long __cpuinit calibrate_delay_direct(void) unsigned long pre_start, start, post_start; unsigned long pre_end, end, post_end; unsigned long start_jiffies; - unsigned long tsc_rate_min, tsc_rate_max; - unsigned long good_tsc_sum = 0; - unsigned long good_tsc_count = 0; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; int i; if (read_current_timer(&pre_start) < 0 ) @@ -81,22 +81,24 @@ static unsigned long __cpuinit calibrate_delay_direct(void) } read_current_timer(&post_end); - tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; - tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; /* - * If the upper limit and lower limit of the tsc_rate is + * If the upper limit and lower limit of the timer_rate is * >= 12.5% apart, redo calibration. */ if (pre_start != 0 && pre_end != 0 && - (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { - good_tsc_count++; - good_tsc_sum += tsc_rate_max; + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; } } - if (good_tsc_count) - return (good_tsc_sum/good_tsc_count); + if (good_timer_count) + return (good_timer_sum/good_timer_count); printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); @@ -111,8 +113,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% * For the boot cpu we can skip the delay calibration and assign it a value - * calculated based on the tsc frequency. - * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -126,11 +128,11 @@ void __cpuinit calibrate_delay(void) loops_per_jiffy = preset_lpj; printk(KERN_INFO "Calibrating delay loop (skipped) preset value.. "); - } else if ((smp_processor_id() == 0) && lpj_tsc) { - loops_per_jiffy = lpj_tsc; + } else if ((smp_processor_id() == 0) && lpj_fine) { + loops_per_jiffy = lpj_fine; printk(KERN_INFO "Calibrating delay loop (skipped), " - "using tsc calculated value.. "); + "value calculated using timer frequency.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { printk(KERN_INFO "Calibrating delay using timer specific routine.. "); -- cgit v1.1 From b664d6bbeeddc77b93f5fea16006b428054f1cd1 Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Tue, 24 Jun 2008 12:31:05 +0200 Subject: x86: add X86_FEATURE_IBS cpu feature This adds IBS to the cpu feature flags allowing Perfmon and OProfile to use cpu_has(). Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/cpufeature.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 0d609c8..e2469a7 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -106,6 +106,7 @@ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ #define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_IBS (6*32+ 10) /* Instruction Based Sampling */ /* * Auxiliary flags: Linux defined - For features scattered in various -- cgit v1.1 From 378fc6eedc0091cc5ba65b298b8967bd2d43df5d Mon Sep 17 00:00:00 2001 From: WANG Cong <xiyou.wangcong@gmail.com> Date: Tue, 24 Jun 2008 16:21:18 +0100 Subject: x86: arch/x86/kernel/machine_kexec_32.c: remove extra semicolons I can't see any reason to keep these semicolons. Signed-off-by: WANG Cong <wangcong@zeuux.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/machine_kexec_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c..f496017 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -39,7 +39,7 @@ static void set_idt(void *newidt, __u16 limit) curidt.address = (unsigned long)newidt; load_idt(&curidt); -}; +} static void set_gdt(void *newgdt, __u16 limit) @@ -51,7 +51,7 @@ static void set_gdt(void *newgdt, __u16 limit) curgdt.address = (unsigned long)newgdt; load_gdt(&curgdt); -}; +} static void load_segments(void) { -- cgit v1.1 From 1ea0704e0da65b2b46f9142ff1391163aac24060 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 04:30:00 -0700 Subject: mm: add a ptep_modify_prot transaction abstraction This patch adds an API for doing read-modify-write updates to a pte's protection bits which may race against hardware updates to the pte. After reading the pte, the hardware may asynchonously set the accessed or dirty bits on a pte, which would be lost when writing back the modified pte value. The existing technique to handle this race is to use ptep_get_and_clear() atomically fetch the old pte value and clear it in memory. This has the effect of marking the pte as non-present, which will prevent the hardware from updating its state. When the new value is written back, the pte will be present again, and the hardware can resume updating the access/dirty flags. When running in a virtualized environment, pagetable updates are relatively expensive, since they generally involve some trap into the hypervisor. To mitigate the cost of these updates, we tend to batch them. However, because of the atomic nature of ptep_get_and_clear(), it is inherently non-batchable. This new interface allows batching by giving the underlying implementation enough information to open a transaction between the read and write phases: ptep_modify_prot_start() returns the current pte value, and puts the pte entry into a state where either the hardware will not update the pte, or if it does, the updates will be preserved on commit. ptep_modify_prot_commit() writes back the updated pte, makes sure that any hardware updates made since ptep_modify_prot_start() are preserved. ptep_modify_prot_start() and _commit() must be exactly paired, and used while holding the appropriate pte lock. They do not protect against other software updates of the pte in any way. The current implementations of ptep_modify_prot_start and _commit are functionally unchanged from before: _start() uses ptep_get_and_clear() fetch the pte and zero the entry, preventing any hardware updates. _commit() simply writes the new pte value back knowing that the hardware has not updated the pte in the meantime. The only current user of this interface is mprotect Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 10 +++----- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 44ef329..4fce3db 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -197,6 +197,63 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) } #endif /* CONFIG_MMU */ +static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + /* + * Get the current pte state, but zero it out to make it + * non-present, preventing the hardware from asynchronously + * updating it. + */ + return ptep_get_and_clear(mm, addr, ptep); +} + +static inline void __ptep_modify_prot_commit(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * The pte is non-present, so there's no hardware state to + * preserve. + */ + set_pte_at(mm, addr, ptep, pte); +} + +#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +/* + * Start a pte protection read-modify-write transaction, which + * protects against asynchronous hardware modifications to the pte. + * The intention is not to prevent the hardware from making pte + * updates, but to prevent any updates it may make from being lost. + * + * This does not protect against other software modifications of the + * pte; the appropriate pte lock must be held over the transation. + * + * Note that this interface is intended to be batchable, meaning that + * ptep_modify_prot_commit may not actually update the pte, but merely + * queue the update to be done at some later time. The update must be + * actually committed before the pte lock is released, however. + */ +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + return __ptep_modify_prot_start(mm, addr, ptep); +} + +/* + * Commit an update to a pte, leaving any hardware-controlled bits in + * the PTE unmodified. + */ +static inline void ptep_modify_prot_commit(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pte_t pte) +{ + __ptep_modify_prot_commit(mm, addr, ptep, pte); +} +#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/mm/mprotect.c b/mm/mprotect.c index a5bf31c..acfe7c8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -47,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, if (pte_present(oldpte)) { pte_t ptent; - /* Avoid an SMP race with hardware updated dirty/clean - * bits by wiping the pte and then setting the new pte - * into place. - */ - ptent = ptep_get_and_clear(mm, addr, pte); + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + /* * Avoid taking write faults for pages we know to be * dirty. */ if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); - set_pte_at(mm, addr, pte, ptent); + + ptep_modify_prot_commit(mm, addr, pte, ptent); #ifdef CONFIG_MIGRATION } else if (!pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.1 From 08b882c627aeeeb3cfd3c4354f0d360d7949549d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 04:30:01 -0700 Subject: paravirt: add hooks for ptep_modify_prot_start/commit This patch adds paravirt-ops hooks in pv_mmu_ops for ptep_modify_prot_start and ptep_modify_prot_commit. This allows the hypervisor-specific backends to implement these in some more efficient way. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/paravirt.c | 3 +++ arch/x86/xen/enlighten.c | 3 +++ include/asm-x86/paravirt.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c98d546..f1ab0f7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -380,6 +380,9 @@ struct pv_mmu_ops pv_mmu_ops = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = kmap_atomic, #endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73fb0c4..0b7553c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1138,6 +1138,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + .pte_val = xen_pte_val, .pte_flags = native_pte_val, .pgd_val = xen_pgd_val, diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 5ea37a4..e9ada31 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -238,6 +238,11 @@ struct pv_mmu_ops { void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + pteval_t (*pte_val)(pte_t); pteval_t (*pte_flags)(pte_t); pte_t (*make_pte)(pteval_t pte); @@ -1039,6 +1044,29 @@ static inline pgdval_t pgd_val(pgd_t pgd) return ret; } +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pteval_t ret; + + ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start, + mm, addr, ptep); + + return (pte_t) { .pte = ret }; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (sizeof(pteval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte); + else + PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit, + mm, addr, ptep, pte.pte); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { if (sizeof(pteval_t) > sizeof(long)) -- cgit v1.1 From e57778a1e30470c9f5b79e370511b9af29b59c48 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 04:30:02 -0700 Subject: xen: implement ptep_modify_prot_start/commit Xen has a pte update function which will update a pte while preserving its accessed and dirty bits. This means that ptep_modify_prot_start() can be implemented as a simple read of the pte value. The hardware may update the pte in the meantime, but ptep_modify_prot_commit() updates it while preserving any changes that may have happened in the meantime. The updates in ptep_modify_prot_commit() are batched if we're currently in lazy mmu mode. The mmu_update hypercall can take a batch of updates to perform, but this code doesn't make particular use of that feature, in favour of using generic multicall batching to get them all into the hypervisor. The net effect of this is that each mprotect pte update turns from two expensive trap-and-emulate faults into they hypervisor into a single hypercall whose cost is amortized in a batched multicall. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/enlighten.c | 13 ++++++++++--- arch/x86/xen/mmu.c | 21 +++++++++++++++++++++ arch/x86/xen/mmu.h | 4 ++++ include/xen/interface/features.h | 3 +++ include/xen/interface/xen.h | 9 +++++++-- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0b7553c..bd74229 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -168,7 +168,9 @@ static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); + printk(KERN_INFO "Hypervisor signature: %s%s\n", + xen_start_info->magic, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } static void xen_cpuid(unsigned int *ax, unsigned int *bx, @@ -1243,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void) BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); + xen_setup_features(); + /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; @@ -1252,14 +1256,17 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { + pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; + pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; + } + machine_ops = xen_machine_ops; #ifdef CONFIG_SMP smp_ops = xen_smp_ops; #endif - xen_setup_features(); - /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 8132aa8..846dad7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -323,6 +323,27 @@ out: preempt_enable(); } +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + /* Just return the pte as-is. We preserve the bits on commit */ + return *ptep; +} + +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u->val = pte_val_ma(pte); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + /* Assume pteval_t is equivalent to all the other *val_t types. */ static pteval_t pte_mfn_to_pfn(pteval_t val) { diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index e3dd09e..297bf9f 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -52,4 +52,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + #endif /* _XEN_MMU_H */ diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h index d73228d..f51b641 100644 --- a/include/xen/interface/features.h +++ b/include/xen/interface/features.h @@ -38,6 +38,9 @@ */ #define XENFEAT_pae_pgdir_above_4gb 4 +/* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ +#define XENFEAT_mmu_pt_update_preserve_ad 5 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 819a033..2befa3e 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -114,9 +114,14 @@ * ptr[:2] -- Machine address within the frame whose mapping to modify. * The frame must belong to the FD, if one is specified. * val -- Value to write into the mapping entry. + * + * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: + * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed + * with those in @val. */ -#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ -#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ +#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ +#define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */ /* * MMU EXTENDED OPERATIONS -- cgit v1.1 From 400d34944c4ad82a817c06e570bc93b1114aa596 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 04:30:03 -0700 Subject: xen: add mechanism to extend existing multicalls Some Xen hypercalls accept an array of operations to work on. In general this is because its more efficient for the hypercall to the work all at once rather than as separate hypercalls (even batched as a multicall). This patch adds a mechanism (xen_mc_extend_args()) to allocate more argument space to the last-issued multicall, in order to extend its argument list. The user of this mechanism is xen/mmu.c, which uses it to extend the args array of mmu_update. This is particularly valuable when doing the update for a large mprotect, which goes via ptep_modify_prot_commit(), but it also manages to batch updates to pgd/pmds as well. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/mmu.c | 55 ++++++++++++++++++++++++++++++----------------- arch/x86/xen/multicalls.c | 40 ++++++++++++++++++++++++++++------ arch/x86/xen/multicalls.h | 12 +++++++++++ 3 files changed, 81 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 846dad7..f6b8225 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -230,18 +230,35 @@ static bool page_pinned(void *ptr) return PagePinned(page); } -void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +static void extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; - preempt_disable(); + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); + + if (mcs.mc != NULL) + mcs.mc->args[1]++; + else { + mcs = __xen_mc_entry(sizeof(*u)); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + } - mcs = xen_mc_entry(sizeof(*u)); u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pmd_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + *u = *update; +} + +void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -332,14 +349,13 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - struct multicall_space mcs; - struct mmu_update *u; + struct mmu_update u; - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; - u->val = pte_val_ma(pte); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.val = pte_val_ma(pte); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -396,16 +412,15 @@ pmdval_t xen_pmd_val(pmd_t pmd) void xen_set_pud_hyper(pud_t *ptr, pud_t val) { - struct multicall_space mcs; - struct mmu_update *u; + struct mmu_update u; preempt_disable(); - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pud_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 5791eb2..3c63c4d 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -29,14 +29,14 @@ #define MC_DEBUG 1 #define MC_BATCH 32 -#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) +#define MC_ARGS (MC_BATCH * 16) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG struct multicall_entry debug[MC_BATCH]; #endif - u64 args[MC_ARGS]; + unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); void *data; @@ -107,20 +107,48 @@ struct multicall_space __xen_mc_entry(size_t args) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct multicall_space ret; - unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + unsigned argidx = roundup(b->argidx, sizeof(u64)); BUG_ON(preemptible()); - BUG_ON(argspace > MC_ARGS); + BUG_ON(b->argidx > MC_ARGS); if (b->mcidx == MC_BATCH || - (b->argidx + argspace) > MC_ARGS) + (argidx + args) > MC_ARGS) { xen_mc_flush(); + argidx = roundup(b->argidx, sizeof(u64)); + } ret.mc = &b->entries[b->mcidx]; b->mcidx++; + ret.args = &b->args[argidx]; + b->argidx = argidx + args; + + BUG_ON(b->argidx > MC_ARGS); + return ret; +} + +struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret = { NULL, NULL }; + + BUG_ON(preemptible()); + BUG_ON(b->argidx > MC_ARGS); + + if (b->mcidx == 0) + return ret; + + if (b->entries[b->mcidx - 1].op != op) + return ret; + + if ((b->argidx + size) > MC_ARGS) + return ret; + + ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; - b->argidx += argspace; + b->argidx += size; + BUG_ON(b->argidx > MC_ARGS); return ret; } diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 8bae996..8589382 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode) /* Set up a callback to be called when the current batch is flushed */ void xen_mc_callback(void (*fn)(void *), void *data); +/* + * Try to extend the arguments of the previous multicall command. The + * previous command's op must match. If it does, then it attempts to + * extend the argument space allocated to the multicall entry by + * arg_size bytes. + * + * The returned multicall_space will return with mc pointing to the + * command on success, or NULL on failure, and args pointing to the + * newly allocated space. + */ +struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); + #endif /* _XEN_MULTICALLS_H */ -- cgit v1.1 From fa3d319ac6f648dc51cb672a31f482c80a8c2457 Mon Sep 17 00:00:00 2001 From: Pavel Machek <pavel@suse.cz> Date: Thu, 26 Jun 2008 00:25:43 +0200 Subject: pci-gart_64.c: could we get better explanation? Add better explanation to pci-gart. Signed-off-by: Pavel Machek <pavel@suse.cz> Cc: Dave Jones <davej@codemonkey.org.uk> Cc: Andi Kleen <andi@firstfloor.org> Cc: Trivial patch monkey <trivial@kernel.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/pci-gart_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f505c38..f20c20a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -824,10 +824,10 @@ void __init gart_iommu_init(void) wbinvd(); /* - * Try to workaround a bug (thanks to BenH) + * Try to workaround a bug (thanks to BenH): * Set unmapped entries to a scratch page instead of 0. * Any prefetches that hit unmapped entries won't get an bus abort - * then. + * then. (P2P bridge may be prefetching on DMA reads). */ scratch = get_zeroed_page(GFP_KERNEL); if (!scratch) -- cgit v1.1 From 2b188723ee1707ca902ddb98ce1decdeafb5190a Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:37 +0200 Subject: x86, AMD IOMMU: add Kconfig entry This patch adds the Kconfig entry for the AMD IOMMU driver. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e0edaaa..5a82f18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -549,6 +549,13 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT Calgary anyway, pass 'iommu=calgary' on the kernel command line. If unsure, say Y. +config AMD_IOMMU + bool "AMD IOMMU support" + select SWIOTL + depends on X86_64 && PCI + help + Select this to get support for AMD IOMMU hardware in your system. + # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool -- cgit v1.1 From 8d283c35a293e6091fdf7ef86842c1174c48a941 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:38 +0200 Subject: x86, AMD IOMMU: add header file for driver data structures and defines This patch adds a header file local to the AMD IOMMU driver with constants and data structures needed in the code. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/amd_iommu_types.h | 242 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 include/asm-x86/amd_iommu_types.h diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h new file mode 100644 index 0000000..0f39550 --- /dev/null +++ b/include/asm-x86/amd_iommu_types.h @@ -0,0 +1,242 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __AMD_IOMMU_TYPES_H__ +#define __AMD_IOMMU_TYPES_H__ + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +/* + * some size calculation constants + */ +#define DEV_TABLE_ENTRY_SIZE 256 +#define ALIAS_TABLE_ENTRY_SIZE 2 +#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) + +/* helper macros */ +#define LOW_U32(x) ((x) & ((1ULL << 32)-1)) +#define HIGH_U32(x) (LOW_U32((x) >> 32)) + +/* Length of the MMIO region for the AMD IOMMU */ +#define MMIO_REGION_LENGTH 0x4000 + +/* Capability offsets used by the driver */ +#define MMIO_CAP_HDR_OFFSET 0x00 +#define MMIO_RANGE_OFFSET 0x0c + +/* Masks, shifts and macros to parse the device range capability */ +#define MMIO_RANGE_LD_MASK 0xff000000 +#define MMIO_RANGE_FD_MASK 0x00ff0000 +#define MMIO_RANGE_BUS_MASK 0x0000ff00 +#define MMIO_RANGE_LD_SHIFT 24 +#define MMIO_RANGE_FD_SHIFT 16 +#define MMIO_RANGE_BUS_SHIFT 8 +#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) +#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) +#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) + +/* Flag masks for the AMD IOMMU exclusion range */ +#define MMIO_EXCL_ENABLE_MASK 0x01ULL +#define MMIO_EXCL_ALLOW_MASK 0x02ULL + +/* Used offsets into the MMIO space */ +#define MMIO_DEV_TABLE_OFFSET 0x0000 +#define MMIO_CMD_BUF_OFFSET 0x0008 +#define MMIO_EVT_BUF_OFFSET 0x0010 +#define MMIO_CONTROL_OFFSET 0x0018 +#define MMIO_EXCL_BASE_OFFSET 0x0020 +#define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_CMD_HEAD_OFFSET 0x2000 +#define MMIO_CMD_TAIL_OFFSET 0x2008 +#define MMIO_EVT_HEAD_OFFSET 0x2010 +#define MMIO_EVT_TAIL_OFFSET 0x2018 +#define MMIO_STATUS_OFFSET 0x2020 + +/* feature control bits */ +#define CONTROL_IOMMU_EN 0x00ULL +#define CONTROL_HT_TUN_EN 0x01ULL +#define CONTROL_EVT_LOG_EN 0x02ULL +#define CONTROL_EVT_INT_EN 0x03ULL +#define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_PASSPW_EN 0x08ULL +#define CONTROL_RESPASSPW_EN 0x09ULL +#define CONTROL_COHERENT_EN 0x0aULL +#define CONTROL_ISOC_EN 0x0bULL +#define CONTROL_CMDBUF_EN 0x0cULL +#define CONTROL_PPFLOG_EN 0x0dULL +#define CONTROL_PPFINT_EN 0x0eULL + +/* command specific defines */ +#define CMD_COMPL_WAIT 0x01 +#define CMD_INV_DEV_ENTRY 0x02 +#define CMD_INV_IOMMU_PAGES 0x03 + +#define CMD_COMPL_WAIT_STORE_MASK 0x01 +#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 +#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 + +/* macros and definitions for device table entries */ +#define DEV_ENTRY_VALID 0x00 +#define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_IR 0x3d +#define DEV_ENTRY_IW 0x3e +#define DEV_ENTRY_EX 0x67 +#define DEV_ENTRY_SYSMGT1 0x68 +#define DEV_ENTRY_SYSMGT2 0x69 +#define DEV_ENTRY_INIT_PASS 0xb8 +#define DEV_ENTRY_EINT_PASS 0xb9 +#define DEV_ENTRY_NMI_PASS 0xba +#define DEV_ENTRY_LINT0_PASS 0xbe +#define DEV_ENTRY_LINT1_PASS 0xbf + +/* constants to configure the command buffer */ +#define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_ENTRIES 512 +#define MMIO_CMD_SIZE_SHIFT 56 +#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) + +#define PAGE_MODE_1_LEVEL 0x01 +#define PAGE_MODE_2_LEVEL 0x02 +#define PAGE_MODE_3_LEVEL 0x03 + +#define IOMMU_PDE_NL_0 0x000ULL +#define IOMMU_PDE_NL_1 0x200ULL +#define IOMMU_PDE_NL_2 0x400ULL +#define IOMMU_PDE_NL_3 0x600ULL + +#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) +#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) +#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) + +#define IOMMU_MAP_SIZE_L1 (1ULL << 21) +#define IOMMU_MAP_SIZE_L2 (1ULL << 30) +#define IOMMU_MAP_SIZE_L3 (1ULL << 39) + +#define IOMMU_PTE_P (1ULL << 0) +#define IOMMU_PTE_U (1ULL << 59) +#define IOMMU_PTE_FC (1ULL << 60) +#define IOMMU_PTE_IR (1ULL << 61) +#define IOMMU_PTE_IW (1ULL << 62) + +#define IOMMU_L1_PDE(address) \ + ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define IOMMU_L2_PDE(address) \ + ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) + +#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) +#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) +#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) +#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) + +#define IOMMU_PROT_MASK 0x03 +#define IOMMU_PROT_IR 0x01 +#define IOMMU_PROT_IW 0x02 + +/* IOMMU capabilities */ +#define IOMMU_CAP_IOTLB 24 +#define IOMMU_CAP_NPCACHE 26 + +#define MAX_DOMAIN_ID 65536 + +struct protection_domain { + spinlock_t lock; + u16 id; + int mode; + u64 *pt_root; + void *priv; +}; + +struct dma_ops_domain { + struct list_head list; + struct protection_domain domain; + unsigned long aperture_size; + unsigned long next_bit; + unsigned long *bitmap; + u64 **pte_pages; +}; + +struct amd_iommu { + struct list_head list; + spinlock_t lock; + + u16 devid; + u16 cap_ptr; + + u64 mmio_phys; + u8 *mmio_base; + u32 cap; + u16 first_device; + u16 last_device; + u64 exclusion_start; + u64 exclusion_length; + + u8 *cmd_buf; + u32 cmd_buf_size; + + int need_sync; + + struct dma_ops_domain *default_dom; +}; + +extern struct list_head amd_iommu_list; + +struct dev_table_entry { + u32 data[8]; +}; + +struct unity_map_entry { + struct list_head list; + u16 devid_start; + u16 devid_end; + u64 address_start; + u64 address_end; + int prot; +}; + +extern struct list_head amd_iommu_unity_map; + +/* data structures for device handling */ +extern struct dev_table_entry *amd_iommu_dev_table; +extern u16 *amd_iommu_alias_table; +extern struct amd_iommu **amd_iommu_rlookup_table; + +extern unsigned amd_iommu_aperture_order; + +extern u16 amd_iommu_last_bdf; + +/* data structures for protection domain handling */ +extern struct protection_domain **amd_iommu_pd_table; +extern unsigned long *amd_iommu_pd_alloc_bitmap; + +extern int amd_iommu_isolate; + +static inline void print_devid(u16 devid, int nl) +{ + int bus = devid >> 8; + int dev = devid >> 3 & 0x1f; + int fn = devid & 0x07; + + printk("%02x:%02x.%x", bus, dev, fn); + if (nl) + printk("\n"); +} + +#endif -- cgit v1.1 From f6e2e6b6fc465bc3cc4eae8d53fcf573ca1cfa14 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:39 +0200 Subject: x86, AMD IOMMU: add defines and structures for ACPI scanning code This patch adds the required data structures and constants required to parse the ACPI table for the AMD IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 arch/x86/kernel/amd_iommu_init.c diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c new file mode 100644 index 0000000..6fce5ab --- /dev/null +++ b/arch/x86/kernel/amd_iommu_init.c @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/gfp.h> +#include <linux/list.h> +#include <asm/pci-direct.h> +#include <asm/amd_iommu_types.h> +#include <asm/gart.h> + +/* + * definitions for the ACPI scanning code + */ +#define UPDATE_LAST_BDF(x) do {\ + if ((x) > amd_iommu_last_bdf) \ + amd_iommu_last_bdf = (x); \ + } while (0); + +#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) +#define PCI_BUS(x) (((x) >> 8) & 0xff) +#define IVRS_HEADER_LENGTH 48 +#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) + +#define ACPI_IVHD_TYPE 0x10 +#define ACPI_IVMD_TYPE_ALL 0x20 +#define ACPI_IVMD_TYPE 0x21 +#define ACPI_IVMD_TYPE_RANGE 0x22 + +#define IVHD_DEV_ALL 0x01 +#define IVHD_DEV_SELECT 0x02 +#define IVHD_DEV_SELECT_RANGE_START 0x03 +#define IVHD_DEV_RANGE_END 0x04 +#define IVHD_DEV_ALIAS 0x42 +#define IVHD_DEV_ALIAS_RANGE 0x43 +#define IVHD_DEV_EXT_SELECT 0x46 +#define IVHD_DEV_EXT_SELECT_RANGE 0x47 + +#define IVHD_FLAG_HT_TUN_EN 0x00 +#define IVHD_FLAG_PASSPW_EN 0x01 +#define IVHD_FLAG_RESPASSPW_EN 0x02 +#define IVHD_FLAG_ISOC_EN 0x03 + +#define IVMD_FLAG_EXCL_RANGE 0x08 +#define IVMD_FLAG_UNITY_MAP 0x01 + +#define ACPI_DEVFLAG_INITPASS 0x01 +#define ACPI_DEVFLAG_EXTINT 0x02 +#define ACPI_DEVFLAG_NMI 0x04 +#define ACPI_DEVFLAG_SYSMGT1 0x10 +#define ACPI_DEVFLAG_SYSMGT2 0x20 +#define ACPI_DEVFLAG_LINT0 0x40 +#define ACPI_DEVFLAG_LINT1 0x80 +#define ACPI_DEVFLAG_ATSDIS 0x10000000 + +struct ivhd_header { + u8 type; + u8 flags; + u16 length; + u16 devid; + u16 cap_ptr; + u64 mmio_phys; + u16 pci_seg; + u16 info; + u32 reserved; +} __attribute__((packed)); + +struct ivhd_entry { + u8 type; + u16 devid; + u8 flags; + u32 ext; +} __attribute__((packed)); + +struct ivmd_header { + u8 type; + u8 flags; + u16 length; + u16 devid; + u16 aux; + u64 resv; + u64 range_start; + u64 range_length; +} __attribute__((packed)); + -- cgit v1.1 From 928abd2545fe367ea3ff3cb8a5076e1d6d2a9574 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:40 +0200 Subject: x86, AMD IOMMU: add data structures to manage the IOMMUs in the system This patch adds the data structures which will contain the information read from the ACPI table. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6fce5ab..0ad8cf9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -99,3 +99,20 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +static int __initdata amd_iommu_disable; + +u16 amd_iommu_last_bdf; +struct list_head amd_iommu_unity_map; +unsigned amd_iommu_aperture_order = 26; +int amd_iommu_isolate; + +struct list_head amd_iommu_list; +struct dev_table_entry *amd_iommu_dev_table; +u16 *amd_iommu_alias_table; +struct amd_iommu **amd_iommu_rlookup_table; +struct protection_domain **amd_iommu_pd_table; +unsigned long *amd_iommu_pd_alloc_bitmap; + +static u32 dev_table_size; +static u32 alias_table_size; +static u32 rlookup_table_size; -- cgit v1.1 From 3e8064ba59128bcb1405079a0789b27b356832b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:41 +0200 Subject: x86, AMD IOMMU: add functions to find last possible PCI device for IOMMU This patch adds functions to find the last PCI bus/device/function the IOMMU driver has to handle. This information is used later to allocate the memory for the data structures. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 79 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0ad8cf9..ee0b2da 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -116,3 +116,82 @@ unsigned long *amd_iommu_pd_alloc_bitmap; static u32 dev_table_size; static u32 alias_table_size; static u32 rlookup_table_size; + +static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) +{ + u32 cap; + + cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); + UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + + return 0; +} + +static int __init find_last_devid_from_ivhd(struct ivhd_header *h) +{ + u8 *p = (void *)h, *end = (void *)h; + struct ivhd_entry *dev; + + p += sizeof(*h); + end += h->length; + + find_last_devid_on_pci(PCI_BUS(h->devid), + PCI_SLOT(h->devid), + PCI_FUNC(h->devid), + h->cap_ptr); + + while (p < end) { + dev = (struct ivhd_entry *)p; + switch (dev->type) { + case IVHD_DEV_SELECT: + case IVHD_DEV_RANGE_END: + case IVHD_DEV_ALIAS: + case IVHD_DEV_EXT_SELECT: + UPDATE_LAST_BDF(dev->devid); + break; + default: + break; + } + p += 0x04 << (*p >> 6); + } + + WARN_ON(p != end); + + return 0; +} + +static int __init find_last_devid_acpi(struct acpi_table_header *table) +{ + int i; + u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table; + struct ivhd_header *h; + + /* + * Validate checksum here so we don't need to do it when + * we actually parse the table + */ + for (i = 0; i < table->length; ++i) + checksum += p[i]; + if (checksum != 0) + /* ACPI table corrupt */ + return -ENODEV; + + p += IVRS_HEADER_LENGTH; + + end += table->length; + while (p < end) { + h = (struct ivhd_header *)p; + switch (h->type) { + case ACPI_IVHD_TYPE: + find_last_devid_from_ivhd(h); + break; + default: + break; + } + p += h->length; + } + WARN_ON(p != end); + + return 0; +} + -- cgit v1.1 From ca7ed057ae25e5e60814f950995f22f051d2e449 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:42 +0200 Subject: x86, AMD IOMMU: add amd_iommu_init.c to Makefile This patch adds the source file amd_iommu_init.c to the kernel Makefile for the x86 architecture. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77807d4..1e4e00a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o + obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o -- cgit v1.1 From 6c56747b46717b4c6a890b35e8518f4be961dc7e Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:43 +0200 Subject: x86, AMD IOMMU: add functions for mapping/unmapping the MMIO space This patch contains two functions to map and unmap the MMIO region of an IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ee0b2da..3147e69 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -117,6 +117,29 @@ static u32 dev_table_size; static u32 alias_table_size; static u32 rlookup_table_size; +static u8 * __init iommu_map_mmio_space(u64 address) +{ + u8 *ret; + + if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) + return NULL; + + ret = ioremap_nocache(address, MMIO_REGION_LENGTH); + if (ret != NULL) + return ret; + + release_mem_region(address, MMIO_REGION_LENGTH); + + return NULL; +} + +static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) +{ + if (iommu->mmio_base) + iounmap(iommu->mmio_base); + release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); +} + static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; -- cgit v1.1 From b2026aa2dce4454950ccd9c410790f310d65696a Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:44 +0200 Subject: x86, AMD IOMMU: add functions for programming IOMMU MMIO space This patch adds the functions required to programm the IOMMU with the MMIO space. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3147e69..ffb8ac8 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -117,6 +117,66 @@ static u32 dev_table_size; static u32 alias_table_size; static u32 rlookup_table_size; +static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +{ + u64 start = iommu->exclusion_start & PAGE_MASK; + u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; + u64 entry; + + if (!iommu->exclusion_start) + return; + + entry = start | MMIO_EXCL_ENABLE_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET, + &entry, sizeof(entry)); + + entry = limit; + memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET, + &entry, sizeof(entry)); +} + +static void __init iommu_set_device_table(struct amd_iommu *iommu) +{ + u32 entry; + + BUG_ON(iommu->mmio_base == NULL); + + entry = virt_to_phys(amd_iommu_dev_table); + entry |= (dev_table_size >> 12) - 1; + memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, + &entry, sizeof(entry)); +} + +static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +{ + u32 ctrl; + + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl |= (1 << bit); + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + +static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +{ + u32 ctrl; + + ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl &= ~(1 << bit); + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + +void __init iommu_enable(struct amd_iommu *iommu) +{ + u32 ctrl; + + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); + print_devid(iommu->devid, 0); + printk(" cap 0x%hx\n", iommu->cap_ptr); + + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; -- cgit v1.1 From b36ca91e1d2d7e846844820784d57d20ad73dbd8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:45 +0200 Subject: x86, AMD IOMMU: add command buffer (de-)allocation This patch adds the functions to allocate and deallocate the command buffer for one IOMMU in the system. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ffb8ac8..c2be3ad 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -278,3 +278,33 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) +{ + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + get_order(CMD_BUFFER_SIZE)); + u64 entry = 0; + + if (cmd_buf == NULL) + return NULL; + + iommu->cmd_buf_size = CMD_BUFFER_SIZE; + + memset(cmd_buf, 0, CMD_BUFFER_SIZE); + + entry = (u64)virt_to_phys(cmd_buf); + entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + + return cmd_buf; +} + +static void __init free_command_buffer(struct amd_iommu *iommu) +{ + if (iommu->cmd_buf) + free_pages((unsigned long)iommu->cmd_buf, + get_order(CMD_BUFFER_SIZE)); +} + -- cgit v1.1 From 3566b7786afd7c14c62726f359df3c827054670b Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:46 +0200 Subject: x86, AMD IOMMU: add device table initialization functions This patch adds functions necessary to initialize the device table from the ACPI definitions. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 45 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c2be3ad..4c37abb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -308,3 +308,48 @@ static void __init free_command_buffer(struct amd_iommu *iommu) get_order(CMD_BUFFER_SIZE)); } +static void set_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + amd_iommu_dev_table[devid].data[i] |= (1 << _bit); +} + +static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +{ + if (flags & ACPI_DEVFLAG_INITPASS) + set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); + if (flags & ACPI_DEVFLAG_EXTINT) + set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS); + if (flags & ACPI_DEVFLAG_NMI) + set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS); + if (flags & ACPI_DEVFLAG_SYSMGT1) + set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1); + if (flags & ACPI_DEVFLAG_SYSMGT2) + set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2); + if (flags & ACPI_DEVFLAG_LINT0) + set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); + if (flags & ACPI_DEVFLAG_LINT1) + set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); +} + +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + +static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) +{ + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) + return; + + if (iommu) { + set_dev_entry_bit(m->devid, DEV_ENTRY_EX); + iommu->exclusion_start = m->range_start; + iommu->exclusion_length = m->range_length; + } +} + -- cgit v1.1 From 5d0c8e49f88b908a8fcc913c4b9843108ae8897b Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:47 +0200 Subject: x86, AMD IOMMU: add functions for IOMMU hardware initialization from ACPI This patch adds functions to initialize the IOMMU hardware with information from ACPI and PCI. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 125 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 4c37abb..8ec48f1 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -353,3 +353,128 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) } } +static void __init init_iommu_from_pci(struct amd_iommu *iommu) +{ + int bus = PCI_BUS(iommu->devid); + int dev = PCI_SLOT(iommu->devid); + int fn = PCI_FUNC(iommu->devid); + int cap_ptr = iommu->cap_ptr; + u32 range; + + iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); + + range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); + iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); + iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); +} + +static void __init init_iommu_from_acpi(struct amd_iommu *iommu, + struct ivhd_header *h) +{ + u8 *p = (u8 *)h; + u8 *end = p, flags = 0; + u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; + u32 ext_flags = 0; + bool alias = 0; + struct ivhd_entry *e; + + /* + * First set the recommended feature enable bits from ACPI + * into the IOMMU control registers + */ + h->flags & IVHD_FLAG_HT_TUN_EN ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + h->flags & IVHD_FLAG_PASSPW_EN ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + h->flags & IVHD_FLAG_RESPASSPW_EN ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + h->flags & IVHD_FLAG_ISOC_EN ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + + /* + * Done. Now parse the device entries + */ + p += sizeof(struct ivhd_header); + end += h->length; + + while (p < end) { + e = (struct ivhd_entry *)p; + switch (e->type) { + case IVHD_DEV_ALL: + for (dev_i = iommu->first_device; + dev_i <= iommu->last_device; ++dev_i) + set_dev_entry_from_acpi(dev_i, e->flags, 0); + break; + case IVHD_DEV_SELECT: + devid = e->devid; + set_dev_entry_from_acpi(devid, e->flags, 0); + break; + case IVHD_DEV_SELECT_RANGE_START: + devid_start = e->devid; + flags = e->flags; + ext_flags = 0; + alias = 0; + break; + case IVHD_DEV_ALIAS: + devid = e->devid; + devid_to = e->ext >> 8; + set_dev_entry_from_acpi(devid, e->flags, 0); + amd_iommu_alias_table[devid] = devid_to; + break; + case IVHD_DEV_ALIAS_RANGE: + devid_start = e->devid; + flags = e->flags; + devid_to = e->ext >> 8; + ext_flags = 0; + alias = 1; + break; + case IVHD_DEV_EXT_SELECT: + devid = e->devid; + set_dev_entry_from_acpi(devid, e->flags, e->ext); + break; + case IVHD_DEV_EXT_SELECT_RANGE: + devid_start = e->devid; + flags = e->flags; + ext_flags = e->ext; + alias = 0; + break; + case IVHD_DEV_RANGE_END: + devid = e->devid; + for (dev_i = devid_start; dev_i <= devid; ++dev_i) { + if (alias) + amd_iommu_alias_table[dev_i] = devid_to; + set_dev_entry_from_acpi( + amd_iommu_alias_table[dev_i], + flags, ext_flags); + } + break; + default: + break; + } + + p += 0x04 << (e->type >> 6); + } +} + +static int __init init_iommu_devices(struct amd_iommu *iommu) +{ + u16 i; + + for (i = iommu->first_device; i <= iommu->last_device; ++i) + set_iommu_for_device(iommu, i); + + return 0; +} + -- cgit v1.1 From e47d402d2df89bb1aa77b7573597a9dd2241aafe Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:48 +0200 Subject: x86, AMD IOMMU: add detect code for AMD IOMMU hardware This patch adds the detection of AMD IOMMU hardware provided on information from ACPI provided by the BIOS. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 78 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8ec48f1..3f4f7b8 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -478,3 +478,81 @@ static int __init init_iommu_devices(struct amd_iommu *iommu) return 0; } +static void __init free_iommu_one(struct amd_iommu *iommu) +{ + free_command_buffer(iommu); + iommu_unmap_mmio_space(iommu); +} + +static void __init free_iommu_all(void) +{ + struct amd_iommu *iommu, *next; + + list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + list_del(&iommu->list); + free_iommu_one(iommu); + kfree(iommu); + } +} + +static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) +{ + spin_lock_init(&iommu->lock); + list_add_tail(&iommu->list, &amd_iommu_list); + + /* + * Copy data from ACPI table entry to the iommu struct + */ + iommu->devid = h->devid; + iommu->cap_ptr = h->cap_ptr; + iommu->mmio_phys = h->mmio_phys; + iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); + if (!iommu->mmio_base) + return -ENOMEM; + + iommu_set_device_table(iommu); + iommu->cmd_buf = alloc_command_buffer(iommu); + if (!iommu->cmd_buf) + return -ENOMEM; + + init_iommu_from_pci(iommu); + init_iommu_from_acpi(iommu, h); + init_iommu_devices(iommu); + + return 0; +} + +static int __init init_iommu_all(struct acpi_table_header *table) +{ + u8 *p = (u8 *)table, *end = (u8 *)table; + struct ivhd_header *h; + struct amd_iommu *iommu; + int ret; + + INIT_LIST_HEAD(&amd_iommu_list); + + end += table->length; + p += IVRS_HEADER_LENGTH; + + while (p < end) { + h = (struct ivhd_header *)p; + switch (*p) { + case ACPI_IVHD_TYPE: + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); + if (iommu == NULL) + return -ENOMEM; + ret = init_iommu_one(iommu, h); + if (ret) + return ret; + break; + default: + break; + } + p += h->length; + + } + WARN_ON(p != end); + + return 0; +} + -- cgit v1.1 From be2a022c0dd0f630b06f83de284df53cb60a308f Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:49 +0200 Subject: x86, AMD IOMMU: add functions to parse IOMMU memory mapping requirements for devices This patch adds the functions to parse the information about IOMMU exclusion ranges and required unity mappings for the devices handled by the IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3f4f7b8..555fcc9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -556,3 +556,90 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +static void __init free_unity_maps(void) +{ + struct unity_map_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int __init init_exclusion_range(struct ivmd_header *m) +{ + int i; + + switch (m->type) { + case ACPI_IVMD_TYPE: + set_device_exclusion_range(m->devid, m); + break; + case ACPI_IVMD_TYPE_ALL: + for (i = 0; i < amd_iommu_last_bdf; ++i) + set_device_exclusion_range(i, m); + break; + case ACPI_IVMD_TYPE_RANGE: + for (i = m->devid; i <= m->aux; ++i) + set_device_exclusion_range(i, m); + break; + default: + break; + } + + return 0; +} + +static int __init init_unity_map_range(struct ivmd_header *m) +{ + struct unity_map_entry *e = 0; + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (e == NULL) + return -ENOMEM; + + switch (m->type) { + default: + case ACPI_IVMD_TYPE: + e->devid_start = e->devid_end = m->devid; + break; + case ACPI_IVMD_TYPE_ALL: + e->devid_start = 0; + e->devid_end = amd_iommu_last_bdf; + break; + case ACPI_IVMD_TYPE_RANGE: + e->devid_start = m->devid; + e->devid_end = m->aux; + break; + } + e->address_start = PAGE_ALIGN(m->range_start); + e->address_end = e->address_start + PAGE_ALIGN(m->range_length); + e->prot = m->flags >> 1; + + list_add_tail(&e->list, &amd_iommu_unity_map); + + return 0; +} + +static int __init init_memory_definitions(struct acpi_table_header *table) +{ + u8 *p = (u8 *)table, *end = (u8 *)table; + struct ivmd_header *m; + + INIT_LIST_HEAD(&amd_iommu_unity_map); + + end += table->length; + p += IVRS_HEADER_LENGTH; + + while (p < end) { + m = (struct ivmd_header *)p; + if (m->flags & IVMD_FLAG_EXCL_RANGE) + init_exclusion_range(m); + else if (m->flags & IVMD_FLAG_UNITY_MAP) + init_unity_map_range(m); + + p += m->length; + } + + return 0; +} + -- cgit v1.1 From fe74c9cf3985e307e9734296d08a270d510e3fb7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:50 +0200 Subject: x86, AMD IOMMU: clue initialization code together This patch puts the AMD IOMMU ACPI table parsing and hardware initialization functions together to the main intialization routine. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 126 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 555fcc9..c792ddc 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -643,3 +643,129 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +int __init amd_iommu_init(void) +{ + int i, ret = 0; + + + if (amd_iommu_disable) { + printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + return 0; + } + + /* + * First parse ACPI tables to find the largest Bus/Dev/Func + * we need to handle. Upon this information the shared data + * structures for the IOMMUs in the system will be allocated + */ + if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) + return -ENODEV; + + dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); + alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + + ret = -ENOMEM; + + /* Device table - directly used by all IOMMUs */ + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(dev_table_size)); + if (amd_iommu_dev_table == NULL) + goto out; + + /* + * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the + * IOMMU see for that device + */ + amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(alias_table_size)); + if (amd_iommu_alias_table == NULL) + goto free; + + /* IOMMU rlookup table - find the IOMMU for a specific device */ + amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(rlookup_table_size)); + if (amd_iommu_rlookup_table == NULL) + goto free; + + /* + * Protection Domain table - maps devices to protection domains + * This table has the same size as the rlookup_table + */ + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(rlookup_table_size)); + if (amd_iommu_pd_table == NULL) + goto free; + + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + get_order(MAX_DOMAIN_ID/8)); + if (amd_iommu_pd_alloc_bitmap == NULL) + goto free; + + /* + * memory is allocated now; initialize the device table with all zeroes + * and let all alias entries point to itself + */ + memset(amd_iommu_dev_table, 0, dev_table_size); + for (i = 0; i < amd_iommu_last_bdf; ++i) + amd_iommu_alias_table[i] = i; + + memset(amd_iommu_pd_table, 0, rlookup_table_size); + memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); + + /* + * never allocate domain 0 because its used as the non-allocated and + * error value placeholder + */ + amd_iommu_pd_alloc_bitmap[0] = 1; + + /* + * now the data structures are allocated and basically initialized + * start the real acpi table scan + */ + ret = -ENODEV; + if (acpi_table_parse("IVRS", init_iommu_all) != 0) + goto free; + + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) + goto free; + + printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", + (1 << (amd_iommu_aperture_order-20))); + + printk(KERN_INFO "AMD IOMMU: device isolation "); + if (amd_iommu_isolate) + printk("enabled\n"); + else + printk("disabled\n"); + +out: + return ret; + +free: + if (amd_iommu_pd_alloc_bitmap) + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + + if (amd_iommu_pd_table) + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); + + if (amd_iommu_rlookup_table) + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); + + if (amd_iommu_alias_table) + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); + + if (amd_iommu_dev_table) + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); + + free_iommu_all(); + + free_unity_maps(); + + goto out; +} + -- cgit v1.1 From ae7877de9cd8da075a46d5f615cb9fc9858e4b2f Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:51 +0200 Subject: x86, AMD IOMMU: add early detection code This patch adds code to detect an AMD IOMMU early during boot before the early detection code of other IOMMUs run. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c792ddc..d7a75bf 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -769,3 +769,23 @@ free: goto out; } +static int __init early_amd_iommu_detect(struct acpi_table_header *table) +{ + return 0; +} + +void __init amd_iommu_detect(void) +{ + if (swiotlb || no_iommu || iommu_detected) + return; + + if (amd_iommu_disable) + return; + + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { + iommu_detected = 1; + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; + } +} + -- cgit v1.1 From 918ad6c54d107ccfc4ccaa3a54b97437467fb58a Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:52 +0200 Subject: x86, AMD IOMMU: add kernel command line parameters for AMD IOMMU This patch adds two parameters to the kernel command line to control behavior of the AMD IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index d7a75bf..ec6f13b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -789,3 +789,37 @@ void __init amd_iommu_detect(void) } } +static int __init parse_amd_iommu_options(char *str) +{ + for (; *str; ++str) { + if (strcmp(str, "off") == 0) + amd_iommu_disable = 1; + if (strcmp(str, "isolate") == 0) + amd_iommu_isolate = 1; + } + + return 1; +} + +static int __init parse_amd_iommu_size_options(char *str) +{ + for (; *str; ++str) { + if (strcmp(str, "32M") == 0) + amd_iommu_aperture_order = 25; + if (strcmp(str, "64M") == 0) + amd_iommu_aperture_order = 26; + if (strcmp(str, "128M") == 0) + amd_iommu_aperture_order = 27; + if (strcmp(str, "256M") == 0) + amd_iommu_aperture_order = 28; + if (strcmp(str, "512M") == 0) + amd_iommu_aperture_order = 29; + if (strcmp(str, "1G") == 0) + amd_iommu_aperture_order = 30; + } + + return 1; +} + +__setup("amd_iommu=", parse_amd_iommu_options); +__setup("amd_iommu_size=", parse_amd_iommu_size_options); -- cgit v1.1 From b6c02715cff8c96174022166bfa3e1a25436fb34 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:53 +0200 Subject: x86, AMD IOMMU: add generic defines and structures for mapping code This patch adds generic stuff used by the mapping and unmapping code for the AMD IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 arch/x86/kernel/amd_iommu.c diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c new file mode 100644 index 0000000..90392c7 --- /dev/null +++ b/arch/x86/kernel/amd_iommu.c @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/pci.h> +#include <linux/gfp.h> +#include <linux/bitops.h> +#include <linux/scatterlist.h> +#include <linux/iommu-helper.h> +#include <asm/proto.h> +#include <asm/gart.h> +#include <asm/amd_iommu_types.h> + +#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) + +#define to_pages(addr, size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + +static DEFINE_RWLOCK(amd_iommu_devtable_lock); + +struct command { + u32 data[4]; +}; + + -- cgit v1.1 From 000fca2dfcfbd2859fed1208c38c899ab4873049 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:54 +0200 Subject: x86, AMD IOMMU: add amd_iommu.c to Makefile This patch adds the new amd_iommu.c file to the architecture kernel Makefile for x86. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1e4e00a..b34e0bb 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,7 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o - obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o + obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o -- cgit v1.1 From a19ae1eccfb2d97f4704b1a2b3d1d9905845dcac Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:55 +0200 Subject: x86, AMD IOMMU: add functions to send IOMMU commands This patch adds generic handling function as well as all functions to send specific commands to the IOMMU hardware as required by this driver. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 106 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 90392c7..a24ee4a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -37,4 +37,110 @@ struct command { u32 data[4]; }; +static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +{ + u32 tail, head; + u8 *target; + + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + target = (iommu->cmd_buf + tail); + memcpy_toio(target, cmd, sizeof(*cmd)); + tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + if (tail == head) + return -ENOMEM; + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + return 0; +} + +static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&iommu->lock, flags); + ret = __iommu_queue_command(iommu, cmd); + spin_unlock_irqrestore(&iommu->lock, flags); + + return ret; +} + +static int iommu_completion_wait(struct amd_iommu *iommu) +{ + int ret; + struct command cmd; + volatile u64 ready = 0; + unsigned long ready_phys = virt_to_phys(&ready); + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; + cmd.data[1] = HIGH_U32(ready_phys); + cmd.data[2] = 1; /* value written to 'ready' */ + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + iommu->need_sync = 0; + + ret = iommu_queue_command(iommu, &cmd); + + if (ret) + return ret; + + while (!ready) + cpu_relax(); + + return 0; +} + +static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) +{ + struct command cmd; + + BUG_ON(iommu == NULL); + + memset(&cmd, 0, sizeof(cmd)); + CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); + cmd.data[0] = devid; + + iommu->need_sync = 1; + + return iommu_queue_command(iommu, &cmd); +} + +static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, + u64 address, u16 domid, int pde, int s) +{ + struct command cmd; + + memset(&cmd, 0, sizeof(cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); + cmd.data[1] |= domid; + cmd.data[2] = LOW_U32(address); + cmd.data[3] = HIGH_U32(address); + if (s) + cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) + cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + + iommu->need_sync = 1; + + return iommu_queue_command(iommu, &cmd); +} + +static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, + u64 address, size_t size) +{ + int i; + unsigned pages = to_pages(address, size); + + address &= PAGE_MASK; + + for (i = 0; i < pages; ++i) { + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 0); + address += PAGE_SIZE; + } + + return 0; +} -- cgit v1.1 From bd0e521158af407ec816aea070831d4ca7ae65e9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:56 +0200 Subject: x86, AMD IOMMU: add functions to initialize unity mappings This patch adds the functions which will initialize the unity mappings in the device page tables. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 122 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a24ee4a..1d70f5e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -37,6 +37,9 @@ struct command { u32 data[4]; }; +static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, + struct unity_map_entry *e); + static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { u32 tail, head; @@ -144,3 +147,122 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +static int iommu_map(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) +{ + u64 __pte, *pte, *page; + + bus_addr = PAGE_ALIGN(bus_addr); + phys_addr = PAGE_ALIGN(bus_addr); + + /* only support 512GB address spaces for now */ + if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + return -EINVAL; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + + if (IOMMU_PTE_PRESENT(*pte)) + return -EBUSY; + + __pte = phys_addr | IOMMU_PTE_P; + if (prot & IOMMU_PROT_IR) + __pte |= IOMMU_PTE_IR; + if (prot & IOMMU_PROT_IW) + __pte |= IOMMU_PTE_IW; + + *pte = __pte; + + return 0; +} + +static int iommu_for_unity_map(struct amd_iommu *iommu, + struct unity_map_entry *entry) +{ + u16 bdf, i; + + for (i = entry->devid_start; i <= entry->devid_end; ++i) { + bdf = amd_iommu_alias_table[i]; + if (amd_iommu_rlookup_table[bdf] == iommu) + return 1; + } + + return 0; +} + +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + +static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, + struct unity_map_entry *e) +{ + u64 addr; + int ret; + + for (addr = e->address_start; addr < e->address_end; + addr += PAGE_SIZE) { + ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + if (ret) + return ret; + /* + * if unity mapping is in aperture range mark the page + * as allocated in the aperture + */ + if (addr < dma_dom->aperture_size) + __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + } + + return 0; +} + +static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, + u16 devid) +{ + struct unity_map_entry *e; + int ret; + + list_for_each_entry(e, &amd_iommu_unity_map, list) { + if (!(devid >= e->devid_start && devid <= e->devid_end)) + continue; + ret = dma_ops_unity_map(dma_dom, e); + if (ret) + return ret; + } + + return 0; +} + -- cgit v1.1 From d3086444b2988659a50af09462261c78d3012cb4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:57 +0200 Subject: x86, AMD IOMMU: add address allocation and deallocation functions This patch adds the address allocator to the AMD IOMMU code. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 1d70f5e..69d8d02 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -266,3 +266,51 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } +static unsigned long dma_mask_to_pages(unsigned long mask) +{ + return (mask >> PAGE_SHIFT) + + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); +} + +static unsigned long dma_ops_alloc_addresses(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages) +{ + unsigned long limit = dma_mask_to_pages(*dev->dma_mask); + unsigned long address; + unsigned long size = dom->aperture_size >> PAGE_SHIFT; + unsigned long boundary_size; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + limit = limit < size ? limit : size; + + if (dom->next_bit >= limit) + dom->next_bit = 0; + + address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, + 0 , boundary_size, 0); + if (address == -1) + address = iommu_area_alloc(dom->bitmap, limit, 0, pages, + 0, boundary_size, 0); + + if (likely(address != -1)) { + set_bit_string(dom->bitmap, address, pages); + dom->next_bit = address + pages; + address <<= PAGE_SHIFT; + } else + address = bad_dma_address; + + WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + + return address; +} + +static void dma_ops_free_addresses(struct dma_ops_domain *dom, + unsigned long address, + unsigned int pages) +{ + address >>= PAGE_SHIFT; + iommu_area_free(dom->bitmap, address, pages); +} + -- cgit v1.1 From ec487d1a110abfffebdf116560db5c1d71b94cab Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:58 +0200 Subject: x86, AMD IOMMU: add domain allocation and deallocation functions This patch adds the functions to allocate and free protection domains for the IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 147 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 69d8d02..c43d15d 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -314,3 +314,150 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +static u16 domain_id_alloc(void) +{ + unsigned long flags; + int id; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); + BUG_ON(id == 0); + if (id > 0 && id < MAX_DOMAIN_ID) + __set_bit(id, amd_iommu_pd_alloc_bitmap); + else + id = 0; + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return id; +} + +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) +{ + unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + + if (start_page + pages > last_page) + pages = last_page - start_page; + + set_bit_string(dom->bitmap, start_page, pages); +} + +static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) +{ + int i, j; + u64 *p1, *p2, *p3; + + p1 = dma_dom->domain.pt_root; + + if (!p1) + return; + + for (i = 0; i < 512; ++i) { + if (!IOMMU_PTE_PRESENT(p1[i])) + continue; + + p2 = IOMMU_PTE_PAGE(p1[i]); + for (j = 0; j < 512; ++i) { + if (!IOMMU_PTE_PRESENT(p2[j])) + continue; + p3 = IOMMU_PTE_PAGE(p2[j]); + free_page((unsigned long)p3); + } + + free_page((unsigned long)p2); + } + + free_page((unsigned long)p1); +} + +static void dma_ops_domain_free(struct dma_ops_domain *dom) +{ + if (!dom) + return; + + dma_ops_free_pagetable(dom); + + kfree(dom->pte_pages); + + kfree(dom->bitmap); + + kfree(dom); +} + +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, + unsigned order) +{ + struct dma_ops_domain *dma_dom; + unsigned i, num_pte_pages; + u64 *l2_pde; + u64 address; + + /* + * Currently the DMA aperture must be between 32 MB and 1GB in size + */ + if ((order < 25) || (order > 30)) + return NULL; + + dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); + if (!dma_dom) + return NULL; + + spin_lock_init(&dma_dom->domain.lock); + + dma_dom->domain.id = domain_id_alloc(); + if (dma_dom->domain.id == 0) + goto free_dma_dom; + dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.priv = dma_dom; + if (!dma_dom->domain.pt_root) + goto free_dma_dom; + dma_dom->aperture_size = (1ULL << order); + dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), + GFP_KERNEL); + if (!dma_dom->bitmap) + goto free_dma_dom; + /* + * mark the first page as allocated so we never return 0 as + * a valid dma-address. So we can use 0 as error value + */ + dma_dom->bitmap[0] = 1; + dma_dom->next_bit = 0; + + if (iommu->exclusion_start && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = to_pages(iommu->exclusion_start, + iommu->exclusion_length); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); + dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), + GFP_KERNEL); + if (!dma_dom->pte_pages) + goto free_dma_dom; + + l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); + if (l2_pde == NULL) + goto free_dma_dom; + + dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); + + for (i = 0; i < num_pte_pages; ++i) { + dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->pte_pages[i]) + goto free_dma_dom; + address = virt_to_phys(dma_dom->pte_pages[i]); + l2_pde[i] = IOMMU_L1_PDE(address); + } + + return dma_dom; + +free_dma_dom: + dma_ops_domain_free(dma_dom); + + return NULL; +} + -- cgit v1.1 From b20ac0d4d6cff969d7074bbf02a1b86058df0804 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:27:59 +0200 Subject: x86, AMD IOMMU: add functions to find IOMMU device resources This patch adds functions necessary to find the IOMMU resources for a specific device. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 75 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c43d15d..47e80b5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -461,3 +461,78 @@ free_dma_dom: return NULL; } +static struct protection_domain *domain_for_device(u16 devid) +{ + struct protection_domain *dom; + unsigned long flags; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = amd_iommu_pd_table[devid]; + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + +static void set_device_domain(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) +{ + unsigned long flags; + + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & 0x07) << 9; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + amd_iommu_dev_table[devid].data[0] = pte_root; + amd_iommu_dev_table[devid].data[1] = pte_root >> 32; + amd_iommu_dev_table[devid].data[2] = domain->id; + + amd_iommu_pd_table[devid] = domain; + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + iommu_queue_inv_dev_entry(iommu, devid); + + iommu->need_sync = 1; +} + +static int get_device_resources(struct device *dev, + struct amd_iommu **iommu, + struct protection_domain **domain, + u16 *bdf) +{ + struct dma_ops_domain *dma_dom; + struct pci_dev *pcidev; + u16 _bdf; + + BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); + + pcidev = to_pci_dev(dev); + _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + + if (_bdf >= amd_iommu_last_bdf) { + *iommu = NULL; + *domain = NULL; + *bdf = 0xffff; + return 0; + } + + *bdf = amd_iommu_alias_table[_bdf]; + + *iommu = amd_iommu_rlookup_table[*bdf]; + if (*iommu == NULL) + return 0; + dma_dom = (*iommu)->default_dom; + *domain = domain_for_device(*bdf); + if (*domain == NULL) { + *domain = &dma_dom->domain; + set_device_domain(*iommu, *domain, *bdf); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device ", (*domain)->id); + print_devid(_bdf, 1); + } + + return 1; +} + -- cgit v1.1 From cb76c3229725c6dcae31da65e9ca57f434628c05 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:00 +0200 Subject: x86, AMD IOMMU: add generic dma_ops mapping functions This patch adds the generic functions to map and unmap pages to a protection domain for dma_ops usage. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 47e80b5..e00a3e7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -536,3 +536,108 @@ static int get_device_resources(struct device *dev, return 1; } +static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, + struct dma_ops_domain *dom, + unsigned long address, + phys_addr_t paddr, + int direction) +{ + u64 *pte, __pte; + + WARN_ON(address > dom->aperture_size); + + paddr &= PAGE_MASK; + + pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte += IOMMU_PTE_L0_INDEX(address); + + __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + + if (direction == DMA_TO_DEVICE) + __pte |= IOMMU_PTE_IR; + else if (direction == DMA_FROM_DEVICE) + __pte |= IOMMU_PTE_IW; + else if (direction == DMA_BIDIRECTIONAL) + __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; + + WARN_ON(*pte); + + *pte = __pte; + + return (dma_addr_t)address; +} + +static void dma_ops_domain_unmap(struct amd_iommu *iommu, + struct dma_ops_domain *dom, + unsigned long address) +{ + u64 *pte; + + if (address >= dom->aperture_size) + return; + + WARN_ON(address & 0xfffULL || address > dom->aperture_size); + + pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte += IOMMU_PTE_L0_INDEX(address); + + WARN_ON(!*pte); + + *pte = 0ULL; +} + +static dma_addr_t __map_single(struct device *dev, + struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + phys_addr_t paddr, + size_t size, + int dir) +{ + dma_addr_t offset = paddr & ~PAGE_MASK; + dma_addr_t address, start; + unsigned int pages; + int i; + + pages = to_pages(paddr, size); + paddr &= PAGE_MASK; + + address = dma_ops_alloc_addresses(dev, dma_dom, pages); + if (unlikely(address == bad_dma_address)) + goto out; + + start = address; + for (i = 0; i < pages; ++i) { + dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + paddr += PAGE_SIZE; + start += PAGE_SIZE; + } + address += offset; + +out: + return address; +} + +static void __unmap_single(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + dma_addr_t dma_addr, + size_t size, + int dir) +{ + dma_addr_t i, start; + unsigned int pages; + + if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) + return; + + pages = to_pages(dma_addr, size); + dma_addr &= PAGE_MASK; + start = dma_addr; + + for (i = 0; i < pages; ++i) { + dma_ops_domain_unmap(iommu, dma_dom, start); + start += PAGE_SIZE; + } + + dma_ops_free_addresses(dma_dom, dma_addr, pages); +} + -- cgit v1.1 From 4da70b9e4f8576ec906dba9240c5b6bc6584f91d Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:01 +0200 Subject: x86, AMD IOMMU: add dma_ops mapping functions for single mappings This patch adds the dma_ops specific mapping functions for single mappings. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 59 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e00a3e7..b4079f6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -40,6 +40,11 @@ struct command { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +static int iommu_has_npcache(struct amd_iommu *iommu) +{ + return iommu->cap & IOMMU_CAP_NPCACHE; +} + static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { u32 tail, head; @@ -641,3 +646,57 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); } +static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, + size_t size, int dir) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + dma_addr_t addr; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (iommu == NULL || domain == NULL) + return (dma_addr_t)paddr; + + spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); + if (addr == bad_dma_address) + goto out; + + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return addr; +} + +static void unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int dir) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + + if (!get_device_resources(dev, &iommu, &domain, &devid)) + return; + + spin_lock_irqsave(&domain->lock, flags); + + __unmap_single(iommu, domain->priv, dma_addr, size, dir); + + iommu_flush_pages(iommu, domain->id, dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); +} + -- cgit v1.1 From 65b050adbfd9481ec20514cfc06fa596a92cb3b5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:02 +0200 Subject: x86, AMD IOMMU: add mapping functions for scatter gather lists This patch adds the dma_ops functions for mapping and unmapping scatter gather lists. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 98 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b4079f6..f4747fe 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -700,3 +700,101 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&domain->lock, flags); } +static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + struct scatterlist *s; + int i; + + for_each_sg(sglist, s, nelems, i) { + s->dma_address = (dma_addr_t)sg_phys(s); + s->dma_length = s->length; + } + + return nelems; +} + +static int map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + int i; + struct scatterlist *s; + phys_addr_t paddr; + int mapped_elems = 0; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) + return map_sg_no_iommu(dev, sglist, nelems, dir); + + spin_lock_irqsave(&domain->lock, flags); + + for_each_sg(sglist, s, nelems, i) { + paddr = sg_phys(s); + + s->dma_address = __map_single(dev, iommu, domain->priv, + paddr, s->length, dir); + + if (s->dma_address) { + s->dma_length = s->length; + mapped_elems++; + } else + goto unmap; + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, s->dma_address, + s->dma_length); + } + + if (iommu->need_sync) + iommu_completion_wait(iommu); + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return mapped_elems; +unmap: + for_each_sg(sglist, s, mapped_elems, i) { + if (s->dma_address) + __unmap_single(iommu, domain->priv, s->dma_address, + s->dma_length, dir); + s->dma_address = s->dma_length = 0; + } + + mapped_elems = 0; + + goto out; +} + +static void unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + struct scatterlist *s; + u16 devid; + int i; + + if (!get_device_resources(dev, &iommu, &domain, &devid)) + return; + + spin_lock_irqsave(&domain->lock, flags); + + for_each_sg(sglist, s, nelems, i) { + __unmap_single(iommu, domain->priv, s->dma_address, + s->dma_length, dir); + iommu_flush_pages(iommu, domain->id, s->dma_address, + s->dma_length); + s->dma_address = s->dma_length = 0; + } + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); +} + -- cgit v1.1 From 5d8b53cf3f8762b2230fb3d5b4e2ff78c5e701d8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:03 +0200 Subject: x86, AMD IOMMU: add mapping functions for coherent mappings This patch adds the dma_ops functions for coherent mappings. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f4747fe..aab9125 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -798,3 +798,77 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&domain->lock, flags); } +static void *alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long flags; + void *virt_addr; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + phys_addr_t paddr; + + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + if (!virt_addr) + return 0; + + memset(virt_addr, 0, size); + paddr = virt_to_phys(virt_addr); + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) { + *dma_addr = (dma_addr_t)paddr; + return virt_addr; + } + + spin_lock_irqsave(&domain->lock, flags); + + *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + size, DMA_BIDIRECTIONAL); + + if (*dma_addr == bad_dma_address) { + free_pages((unsigned long)virt_addr, get_order(size)); + virt_addr = NULL; + goto out; + } + + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, *dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return virt_addr; +} + +static void free_coherent(struct device *dev, size_t size, + void *virt_addr, dma_addr_t dma_addr) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) + goto free_mem; + + spin_lock_irqsave(&domain->lock, flags); + + __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + iommu_flush_pages(iommu, domain->id, dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); + +free_mem: + free_pages((unsigned long)virt_addr, get_order(size)); +} + -- cgit v1.1 From c432f3df8ea740e2ecb27da88bd6b5a254714959 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:04 +0200 Subject: x86, AMD IOMMU: add pre-allocation of protection domains This patch adds a function to pre-allocate protection domains. So we don't have to allocate it on the first request for a device (which can happen in atomic mode). Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index aab9125..bed5f82 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -872,3 +872,37 @@ free_mem: free_pages((unsigned long)virt_addr, get_order(size)); } +/* + * If the driver core informs the DMA layer if a driver grabs a device + * we don't need to preallocate the protection domains anymore. + * For now we have to. + */ +void prealloc_protection_domains(void) +{ + struct pci_dev *dev = NULL; + struct dma_ops_domain *dma_dom; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + u16 devid; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + devid = (dev->bus->number << 8) | dev->devfn; + if (devid >= amd_iommu_last_bdf) + continue; + devid = amd_iommu_alias_table[devid]; + if (domain_for_device(devid)) + continue; + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + continue; + dma_dom = dma_ops_domain_alloc(iommu, order); + if (!dma_dom) + continue; + init_unity_mappings_for_device(dma_dom, devid); + set_device_domain(iommu, &dma_dom->domain, devid); + printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", + dma_dom->domain.id); + print_devid(devid, 1); + } +} + -- cgit v1.1 From 6631ee9d00996e7663a086b5abceba65c89ff8f6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:05 +0200 Subject: x86, AMD IOMMU: add dma_ops initialization function This patch adds the function to initialize the dma_ops for the AMD IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index bed5f82..c282bde 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -906,3 +906,49 @@ void prealloc_protection_domains(void) } } +static struct dma_mapping_ops amd_iommu_dma_ops = { + .alloc_coherent = alloc_coherent, + .free_coherent = free_coherent, + .map_single = map_single, + .unmap_single = unmap_single, + .map_sg = map_sg, + .unmap_sg = unmap_sg, +}; + +int __init amd_iommu_init_dma_ops(void) +{ + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + int ret; + + list_for_each_entry(iommu, &amd_iommu_list, list) { + iommu->default_dom = dma_ops_domain_alloc(iommu, order); + if (iommu->default_dom == NULL) + return -ENOMEM; + ret = iommu_init_unity_mappings(iommu); + if (ret) + goto free_domains; + } + + if (amd_iommu_isolate) + prealloc_protection_domains(); + + iommu_detected = 1; + force_iommu = 1; + bad_dma_address = 0; + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; + + dma_ops = &amd_iommu_dma_ops; + + return 0; + +free_domains: + + list_for_each_entry(iommu, &amd_iommu_list, list) { + if (iommu->default_dom) + dma_ops_domain_free(iommu->default_dom); + } + + return ret; +} -- cgit v1.1 From c6da992e16a9d261eb9dfeff14e9777c3e0468c5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:06 +0200 Subject: x86, AMD IOMMU: add amd_iommu.h to export functions to the generic x86 dma code This patch adds the amd_iommu.h file which will be included in the generic code. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + include/asm-x86/amd_iommu.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 include/asm-x86/amd_iommu.h diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c282bde..134dea1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -25,6 +25,7 @@ #include <asm/proto.h> #include <asm/gart.h> #include <asm/amd_iommu_types.h> +#include <asm/amd_iommu.h> #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ec6f13b..bae4a76 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -23,6 +23,7 @@ #include <linux/list.h> #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> +#include <asm/amd_iommu.h> #include <asm/gart.h> /* diff --git a/include/asm-x86/amd_iommu.h b/include/asm-x86/amd_iommu.h new file mode 100644 index 0000000..30a1204 --- /dev/null +++ b/include/asm-x86/amd_iommu.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * Leo Duran <leo.duran@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_H +#define _ASM_X86_AMD_IOMMU_H + +#ifdef CONFIG_AMD_IOMMU +extern int amd_iommu_init(void); +extern int amd_iommu_init_dma_ops(void); +extern void amd_iommu_detect(void); +#else +static inline int amd_iommu_init(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } +#endif + +#endif -- cgit v1.1 From 8736197ba8e40d030eec0ab7a9e9f2be41810c4e Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:07 +0200 Subject: x86, AMD IOMMU: initialize dma_ops from IOMMU initialization and enable IOMMUs This patch adds a call to the driver specific dma_ops initialization routine from the AMD IOMMU hardware initialization. Further it adds the necessary code to finally enable AMD IOMMU hardware. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index bae4a76..d1aa234 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -644,6 +644,16 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +static void __init enable_iommus(void) +{ + struct amd_iommu *iommu; + + list_for_each_entry(iommu, &amd_iommu_list, list) { + iommu_set_exclusion_range(iommu); + iommu_enable(iommu); + } +} + int __init amd_iommu_init(void) { int i, ret = 0; @@ -731,6 +741,12 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; + ret = amd_iommu_init_dma_ops(); + if (ret) + goto free; + + enable_iommus(); + printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", (1 << (amd_iommu_aperture_order-20))); -- cgit v1.1 From a69ca3401821b7312cb7ec939a8814240fd7b9b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:08 +0200 Subject: AMD_IOMMU: call detect and initialization functions from dma code This patch adds the function calls to initialize the AMD IOMMU hardware and dma_ops to the generic DMA code for the x86 architecture. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/pci-dma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dc00a13..dea0167 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -7,6 +7,7 @@ #include <asm/dma.h> #include <asm/gart.h> #include <asm/calgary.h> +#include <asm/amd_iommu.h> int forbid_dac __read_mostly; EXPORT_SYMBOL(forbid_dac); @@ -122,6 +123,8 @@ void __init pci_iommu_alloc(void) detect_intel_iommu(); + amd_iommu_detect(); + #ifdef CONFIG_SWIOTLB pci_swiotlb_init(); #endif @@ -502,6 +505,8 @@ static int __init pci_iommu_init(void) intel_iommu_init(); + amd_iommu_init(); + #ifdef CONFIG_GART_IOMMU gart_iommu_init(); #endif -- cgit v1.1 From 919ee7dd9a66c71e116591c07b88de0db2a387a4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:09 +0200 Subject: x86, AMD IOMMU: add MAINTAINERS entry Add an entry to the MAINTAINERS file for AMD IOMMU. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8f0ec46..5e89cea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -376,6 +376,12 @@ L: linux-geode@lists.infradead.org (moderated for non-subscribers) W: http://www.amd.com/us-en/ConnectivitySolutions/TechnicalResources/0,,50_2334_2452_11363,00.html S: Supported +AMD IOMMU (AMD-VI) +P: Joerg Roedel +M: joerg.roedel@amd.com +L: iommu@lists.linux-foundation.org +S: Supported + AMS (Apple Motion Sensor) DRIVER P: Stelian Pop M: stelian@popies.net -- cgit v1.1 From 54b4cbd26987f84791d6e05863a715ded9f7b3a5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 26 Jun 2008 21:28:10 +0200 Subject: x86, AMD IOMMU: add documentation for kernel parameters Add documentation for the kernel parameters introduced with this driver. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: Sebastian.Biemueller@amd.com Cc: robert.richter@amd.com Cc: joro@8bytes.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/kernel-parameters.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e07c432..fc8f936 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -271,6 +271,18 @@ and is between 256 and 4096 characters. It is defined in the file aic79xx= [HW,SCSI] See Documentation/scsi/aic79xx.txt. + amd_iommu= [HW,X86-84] + Pass parameters to the AMD IOMMU driver in the system. + Possible values are: + off - disable the driver for AMD IOMMU + isolate - enable device isolation (each device, as far + as possible, will get its own protection + domain) + amd_iommu_size= [HW,X86-64] + Define the size of the aperture for the AMD IOMMU + driver. Possible values are: + '32M', '64M' (default), '128M', '256M', '512M', '1G' + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: <a>,<b> -- cgit v1.1 From 24d2ba0a8a5816eb8322836271fbcb045d915dfd Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 27 Jun 2008 10:37:03 +0200 Subject: x86, AMD IOMMU: build fix fix: arch/x86/kernel/amd_iommu_init.c:247: warning: 'struct acpi_table_header' declared inside parameter list arch/x86/kernel/amd_iommu_init.c:247: warning: its scope is only this definition or declaration, which is probably not what you want arch/x86/kernel/amd_iommu_init.c: In function 'find_last_devid_acpi': arch/x86/kernel/amd_iommu_init.c:257: error: dereferencing pointer to incomplete type arch/x86/kernel/amd_iommu_init.c:265: error: dereferencing pointer to incomplete type the AMD IOMMU code depends on ACPI facilities. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5a82f18..62a2820 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -552,7 +552,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT config AMD_IOMMU bool "AMD IOMMU support" select SWIOTL - depends on X86_64 && PCI + depends on X86_64 && PCI && ACPI help Select this to get support for AMD IOMMU hardware in your system. -- cgit v1.1 From 92af4e29020ff096178a00605b3662b3b39d4aa9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 27 Jun 2008 10:48:16 +0200 Subject: x86, AMD IOMMU, build fix #2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/amd_iommu.c: In function ‘amd_iommu_init_dma_ops': arch/x86/kernel/amd_iommu.c:940: error: lvalue required as left operand of assignment arch/x86/kernel/amd_iommu.c:941: error: lvalue required as left operand of assignment due to !CONFIG_GART_IOMMU. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 2 ++ arch/x86/kernel/amd_iommu_init.c | 2 ++ include/asm-x86/gart.h | 5 +++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 134dea1..a1b3856 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -937,8 +937,10 @@ int __init amd_iommu_init_dma_ops(void) iommu_detected = 1; force_iommu = 1; bad_dma_address = 0; +#ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; +#endif dma_ops = &amd_iommu_dma_ops; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index d1aa234..6ab8128 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -801,8 +801,10 @@ void __init amd_iommu_detect(void) if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; +#ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; +#endif } } diff --git a/include/asm-x86/gart.h b/include/asm-x86/gart.h index 90958ed..ec5785c 100644 --- a/include/asm-x86/gart.h +++ b/include/asm-x86/gart.h @@ -18,8 +18,9 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern int fix_aperture; #else -#define gart_iommu_aperture 0 -#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 static inline void early_gart_iommu_check(void) { -- cgit v1.1 From 07c40e8a1acdb56fca485a6deeb252ebf19509a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 27 Jun 2008 11:31:28 +0200 Subject: x86, AMD IOMMU: build fix #3 fix typo causing: arch/x86/kernel/built-in.o: In function `__unmap_single': amd_iommu.c:(.text+0x17771): undefined reference to `iommu_area_free' arch/x86/kernel/built-in.o: In function `__map_single': amd_iommu.c:(.text+0x1797a): undefined reference to `iommu_area_alloc' amd_iommu.c:(.text+0x179a2): undefined reference to `iommu_area_alloc' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 62a2820..8aae462 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -551,7 +551,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT config AMD_IOMMU bool "AMD IOMMU support" - select SWIOTL + select SWIOTLB depends on X86_64 && PCI && ACPI help Select this to get support for AMD IOMMU hardware in your system. -- cgit v1.1 From 9d8ad5d6c7fce31fd2c0fd4fe9977bda3e92e340 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Fri, 27 Jun 2008 17:22:17 +0200 Subject: x86: don't destroy %rbp on kernel-mode faults From the code: "B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too." The code then proceeds to truncate the upper 32 bits of %rbp. This means that when do_page_fault() is finally called, its prologue, do_page_fault: push %rbp movl %rsp, %rbp will put the truncated base pointer on the stack. This means that the stack tracer will not be able to follow the base-pointer changes and will see all subsequent stack frames as unreliable. This patch changes the code to use a different register (%rcx) for the checking and leaves %rbp untouched. Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi> Acked-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df..fa1c9eb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -926,11 +926,11 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq irq_return(%rip),%rbp - cmpq %rbp,RIP(%rsp) + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP(%rsp) je error_swapgs - movl %ebp,%ebp /* zero extend */ - cmpq %rbp,RIP(%rsp) + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP(%rsp) je error_swapgs cmpq $gs_change,RIP(%rsp) je error_swapgs -- cgit v1.1 From aa60d13fb04f6d5ce72e4da508a4048b934ebd24 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 27 Jun 2008 13:23:00 -0700 Subject: x86: setup: issue a null command after enabling A20 via KBC Apparently, DOS and possibly other legacy operating systems issued a null command to the keyboard controller after toggling A20, specifically "pulse output pins" with no output pins specified. This was presumably done for synchronization reasons. This has made it into at least the UHCI spec, and it has been found to cause compatibility problems when "legacy USB" is enabled (which it almost always is) to not have this byte sent. It is *NOT* clear if any of these compatibility problems has any effect on Linux. However, for maximum compatibility, issue this null command after togging A20 through the KBC. Signed-off-by: H. Peter Anvin <hpa@zytor.com> --- arch/x86/boot/a20.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 90943f8..ef6e142 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -1,7 +1,7 @@ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -95,6 +95,9 @@ static void enable_a20_kbc(void) outb(0xdf, 0x60); /* A20 on */ empty_8042(); + + outb(0xff, 0x64); /* Null command, but UHCI wants it */ + empty_8042(); } static void enable_a20_fast(void) -- cgit v1.1 From 44974c8fc1d7047abe414562e0782320f4c1f511 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Date: Mon, 30 Jun 2008 08:11:22 +0900 Subject: x86 gart: remove unnecessary set_bit_string iommu_area_alloc internally calls set_bit_string and set bits properly. This set_bit_string is unnecessary. Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: joerg.roedel@amd.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/pci-gart_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index f20c20a..021f3c6 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -104,7 +104,6 @@ static unsigned long alloc_iommu(struct device *dev, int size) size, base_index, boundary_size, 0); } if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); next_bit = offset+size; if (next_bit >= iommu_pages) { next_bit = 0; -- cgit v1.1 From ac2564c44509aba1e21d878ffd4f34ac96811762 Mon Sep 17 00:00:00 2001 From: Matti Linnanvuori <mattilinnanvuori@yahoo.com> Date: Sun, 29 Jun 2008 09:22:43 +0300 Subject: x86: add compilation checks to pci_unmap_*() macros Add compilation checks to pci_unmap_ macros. Signed-off-by: Matti Linnanvuori <mattilinnanvuori@yahoo.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/pci_32.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/asm-x86/pci_32.h b/include/asm-x86/pci_32.h index 8c4c3a0..a50d468 100644 --- a/include/asm-x86/pci_32.h +++ b/include/asm-x86/pci_32.h @@ -18,12 +18,14 @@ struct pci_dev; #define PCI_DMA_BUS_IS_PHYS (1) /* pci_unmap_{page,single} is a nop so... */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; +#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) +#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { break; } while (pci_unmap_len(PTR, LEN_NAME)) #endif /* __KERNEL__ */ -- cgit v1.1 From 7441e9cb18a1a1e5b87f516fa97b6d4abb0838e3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Mon, 30 Jun 2008 20:18:02 +0200 Subject: x86, AMD IOMMU: disable suspend/resume with IOMMU enabled (for now) This patch disables suspend/resume on machines with AMD IOMMU enabled. Real suspend/resume support for AMD IOMMU is currently being worked on. Until this is ready it will be disabled to avoid data corruption when the IOMMU is not properly re-enabled at resume. The patch is based on a similar patch for the GART driver written by Pavel Machek. The overall driver merged into tip/master is tested with parallel disk and network loads and showed no problems in a test running for 3 days. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6ab8128..5d9e45c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -21,6 +21,7 @@ #include <linux/acpi.h> #include <linux/gfp.h> #include <linux/list.h> +#include <linux/sysdev.h> #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -654,6 +655,32 @@ static void __init enable_iommus(void) } } +/* + * Suspend/Resume support + * disable suspend until real resume implemented + */ + +static int amd_iommu_resume(struct sys_device *dev) +{ + return 0; +} + +static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class amd_iommu_sysdev_class = { + .name = "amd_iommu", + .suspend = amd_iommu_suspend, + .resume = amd_iommu_resume, +}; + +static struct sys_device device_amd_iommu = { + .id = 0, + .cls = &amd_iommu_sysdev_class, +}; + int __init amd_iommu_init(void) { int i, ret = 0; @@ -745,6 +772,14 @@ int __init amd_iommu_init(void) if (ret) goto free; + ret = sysdev_class_register(&amd_iommu_sysdev_class); + if (ret) + goto free; + + ret = sysdev_register(&device_amd_iommu); + if (ret) + goto free; + enable_iommus(); printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", -- cgit v1.1 From 908ec7afacfdc83dc10938ed1d3c38b3526034ec Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Mon, 30 Jun 2008 14:42:18 -0700 Subject: x86: remove arbitrary ELF section limit in i386 relocatable kernel Impact: build failure in maximal configurations The 32-bit x86 relocatable kernel requires an auxilliary host program to process the relocations. This program had a hard-coded arbitrary limit of a 100 ELF sections. Instead of a hard-coded limit, allocate the structures dynamically. Signed-off-by: H. Peter Anvin <hpa@zytor.com> Acked-by: Vivek Goyal <vgoyal@redhat.com> --- arch/x86/boot/compressed/relocs.c | 198 +++++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 88 deletions(-) diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index edaadea..a1310c5 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -10,16 +10,20 @@ #define USE_BSD #include <endian.h> -#define MAX_SHDRS 100 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) static Elf32_Ehdr ehdr; -static Elf32_Shdr shdr[MAX_SHDRS]; -static Elf32_Sym *symtab[MAX_SHDRS]; -static Elf32_Rel *reltab[MAX_SHDRS]; -static char *strtab[MAX_SHDRS]; static unsigned long reloc_count, reloc_idx; static unsigned long *relocs; +struct section { + Elf32_Shdr shdr; + struct section *link; + Elf32_Sym *symtab; + Elf32_Rel *reltab; + char *strtab; +}; +static struct section *secs; + /* * Following symbols have been audited. There values are constant and do * not change if bzImage is loaded at a different physical address than @@ -35,7 +39,7 @@ static int is_safe_abs_reloc(const char* sym_name) { int i; - for(i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { + for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { if (!strcmp(sym_name, safe_abs_relocs[i])) /* Match found */ return 1; @@ -137,10 +141,10 @@ static const char *sec_name(unsigned shndx) { const char *sec_strtab; const char *name; - sec_strtab = strtab[ehdr.e_shstrndx]; + sec_strtab = secs[ehdr.e_shstrndx].strtab; name = "<noname>"; if (shndx < ehdr.e_shnum) { - name = sec_strtab + shdr[shndx].sh_name; + name = sec_strtab + secs[shndx].shdr.sh_name; } else if (shndx == SHN_ABS) { name = "ABSOLUTE"; @@ -159,7 +163,7 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) name = sym_strtab + sym->st_name; } else { - name = sec_name(shdr[sym->st_shndx].sh_name); + name = sec_name(secs[sym->st_shndx].shdr.sh_name); } return name; } @@ -244,29 +248,34 @@ static void read_ehdr(FILE *fp) static void read_shdrs(FILE *fp) { int i; - if (ehdr.e_shnum > MAX_SHDRS) { - die("%d section headers supported: %d\n", - ehdr.e_shnum, MAX_SHDRS); + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); + if (!secs) { + die("Unable to allocate %d section headers\n", + ehdr.e_shnum); } if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno)); } - if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) { - die("Cannot read ELF section headers: %s\n", - strerror(errno)); - } - for(i = 0; i < ehdr.e_shnum; i++) { - shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name); - shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type); - shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags); - shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr); - shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset); - shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size); - shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link); - shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info); - shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign); - shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (fread(&shdr, sizeof shdr, 1, fp) != 1) + die("Cannot read ELF section headers %d/%d: %s\n", + i, ehdr.e_shnum, strerror(errno)); + sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); + if (sec->shdr.sh_link < ehdr.e_shnum) + sec->link = &secs[sec->shdr.sh_link]; } } @@ -274,20 +283,22 @@ static void read_shdrs(FILE *fp) static void read_strtabs(FILE *fp) { int i; - for(i = 0; i < ehdr.e_shnum; i++) { - if (shdr[i].sh_type != SHT_STRTAB) { + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_STRTAB) { continue; } - strtab[i] = malloc(shdr[i].sh_size); - if (!strtab[i]) { + sec->strtab = malloc(sec->shdr.sh_size); + if (!sec->strtab) { die("malloc of %d bytes for strtab failed\n", - shdr[i].sh_size); + sec->shdr.sh_size); } - if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", - shdr[i].sh_offset, strerror(errno)); + sec->shdr.sh_offset, strerror(errno)); } - if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { die("Cannot read symbol table: %s\n", strerror(errno)); } @@ -297,28 +308,31 @@ static void read_strtabs(FILE *fp) static void read_symtabs(FILE *fp) { int i,j; - for(i = 0; i < ehdr.e_shnum; i++) { - if (shdr[i].sh_type != SHT_SYMTAB) { + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { continue; } - symtab[i] = malloc(shdr[i].sh_size); - if (!symtab[i]) { + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { die("malloc of %d bytes for symtab failed\n", - shdr[i].sh_size); + sec->shdr.sh_size); } - if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", - shdr[i].sh_offset, strerror(errno)); + sec->shdr.sh_offset, strerror(errno)); } - if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { die("Cannot read symbol table: %s\n", strerror(errno)); } - for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) { - symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name); - symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value); - symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size); - symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx); + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym = &sec->symtab[j]; + sym->st_name = elf32_to_cpu(sym->st_name); + sym->st_value = elf32_to_cpu(sym->st_value); + sym->st_size = elf32_to_cpu(sym->st_size); + sym->st_shndx = elf16_to_cpu(sym->st_shndx); } } } @@ -327,26 +341,29 @@ static void read_symtabs(FILE *fp) static void read_relocs(FILE *fp) { int i,j; - for(i = 0; i < ehdr.e_shnum; i++) { - if (shdr[i].sh_type != SHT_REL) { + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { continue; } - reltab[i] = malloc(shdr[i].sh_size); - if (!reltab[i]) { + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) { die("malloc of %d bytes for relocs failed\n", - shdr[i].sh_size); + sec->shdr.sh_size); } - if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", - shdr[i].sh_offset, strerror(errno)); + sec->shdr.sh_offset, strerror(errno)); } - if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { die("Cannot read symbol table: %s\n", strerror(errno)); } - for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { - reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset); - reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info); + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf32_to_cpu(rel->r_offset); + rel->r_info = elf32_to_cpu(rel->r_info); } } } @@ -357,19 +374,21 @@ static void print_absolute_symbols(void) int i; printf("Absolute symbols\n"); printf(" Num: Value Size Type Bind Visibility Name\n"); - for(i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; char *sym_strtab; Elf32_Sym *sh_symtab; int j; - if (shdr[i].sh_type != SHT_SYMTAB) { + + if (sec->shdr.sh_type != SHT_SYMTAB) { continue; } - sh_symtab = symtab[i]; - sym_strtab = strtab[shdr[i].sh_link]; - for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) { + sh_symtab = sec->symtab; + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { Elf32_Sym *sym; const char *name; - sym = &symtab[i][j]; + sym = &sec->symtab[j]; name = sym_name(sym_strtab, sym); if (sym->st_shndx != SHN_ABS) { continue; @@ -389,26 +408,27 @@ static void print_absolute_relocs(void) { int i, printed = 0; - for(i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + struct section *sec_applies, *sec_symtab; char *sym_strtab; Elf32_Sym *sh_symtab; - unsigned sec_applies, sec_symtab; int j; - if (shdr[i].sh_type != SHT_REL) { + if (sec->shdr.sh_type != SHT_REL) { continue; } - sec_symtab = shdr[i].sh_link; - sec_applies = shdr[i].sh_info; - if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { continue; } - sh_symtab = symtab[sec_symtab]; - sym_strtab = strtab[shdr[sec_symtab].sh_link]; - for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { Elf32_Rel *rel; Elf32_Sym *sym; const char *name; - rel = &reltab[i][j]; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; name = sym_name(sym_strtab, sym); if (sym->st_shndx != SHN_ABS) { @@ -456,26 +476,28 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) { int i; /* Walk through the relocations */ - for(i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < ehdr.e_shnum; i++) { char *sym_strtab; Elf32_Sym *sh_symtab; - unsigned sec_applies, sec_symtab; + struct section *sec_applies, *sec_symtab; int j; - if (shdr[i].sh_type != SHT_REL) { + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { continue; } - sec_symtab = shdr[i].sh_link; - sec_applies = shdr[i].sh_info; - if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { continue; } - sh_symtab = symtab[sec_symtab]; - sym_strtab = strtab[shdr[sec_symtab].sh_link]; - for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { + sh_symtab = sec_symtab->symtab; + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { Elf32_Rel *rel; Elf32_Sym *sym; unsigned r_type; - rel = &reltab[i][j]; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); /* Don't visit relocations to absolute symbols */ @@ -539,7 +561,7 @@ static void emit_relocs(int as_text) */ printf(".section \".data.reloc\",\"a\"\n"); printf(".balign 4\n"); - for(i = 0; i < reloc_count; i++) { + for (i = 0; i < reloc_count; i++) { printf("\t .long 0x%08lx\n", relocs[i]); } printf("\n"); @@ -550,7 +572,7 @@ static void emit_relocs(int as_text) /* Print a stop */ printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); /* Now print each relocation */ - for(i = 0; i < reloc_count; i++) { + for (i = 0; i < reloc_count; i++) { buf[0] = (relocs[i] >> 0) & 0xff; buf[1] = (relocs[i] >> 8) & 0xff; buf[2] = (relocs[i] >> 16) & 0xff; @@ -577,7 +599,7 @@ int main(int argc, char **argv) show_absolute_relocs = 0; as_text = 0; fname = NULL; - for(i = 1; i < argc; i++) { + for (i = 1; i < argc; i++) { char *arg = argv[i]; if (*arg == '-') { if (strcmp(argv[1], "--abs-syms") == 0) { -- cgit v1.1 From 2ee2394b682c0ee99b0f083abe6c57727e6edb69 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Mon, 30 Jun 2008 15:42:47 -0700 Subject: x86: fix regression: boot failure on AMD Elan TS-5500 Jeremy Fitzhardinge wrote: > > Maybe it really does require the far jump immediately after setting PE > in cr0... > > Hm, I don't remember this paragraph being in vol 3a, section 8.9.1 > before. Is it a recent addition? > > Random failures can occur if other instructions exist between steps > 3 and 4 above. Failures will be readily seen in some situations, > such as when instructions that reference memory are inserted between > steps 3 and 4 while in system management mode. > I don't remember that, either. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/boot/pmjump.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index ab049d4..141b6e2 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -33,6 +33,8 @@ protected_mode_jump: movw %cs, %bx shll $4, %ebx addl %ebx, 2f + jmp 1f # Short jump to serialize on 386/486 +1: movw $__BOOT_DS, %cx movw $__BOOT_TSS, %di @@ -40,8 +42,6 @@ protected_mode_jump: movl %cr0, %edx orb $X86_CR0_PE, %dl # Protected mode movl %edx, %cr0 - jmp 1f # Short jump to serialize on 386/486 -1: # Transition to 32-bit mode .byte 0x66, 0xea # ljmpl opcode -- cgit v1.1 From 45fdc3a7624a4a48185a04ae0abab5f9793d8952 Mon Sep 17 00:00:00 2001 From: Roland McGrath <roland@redhat.com> Date: Mon, 30 Jun 2008 14:02:41 -0700 Subject: x86 ptrace: fix PTRACE_GETFPXREGS error ptrace has always returned only -EIO for all failures to access registers. The user_regset calls are allowed to return a more meaningful variety of errors. The REGSET_XFP calls use -ENODEV for !cpu_has_fxsr hardware. Make ptrace return the traditional -EIO instead of the error code from the user_regset call. Signed-off-by: Roland McGrath <roland@redhat.com> Cc: stable@kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/i387.c | 4 ++-- arch/x86/kernel/ptrace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 95e80e5..eb9ddd8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -EIO; + return -ENODEV; ret = init_fpu(target); if (ret) @@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -EIO; + return -ENODEV; ret = init_fpu(target); if (ret) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a7835f2..77040b6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) return copy_regset_to_user(child, &user_x86_32_view, REGSET_XFP, 0, sizeof(struct user_fxsr_struct), - datap); + datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET_XFP, 0, sizeof(struct user_fxsr_struct), - datap); + datap) ? -EIO : 0; #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -- cgit v1.1 From f294a8ce211bed7bfaca19bef21376a86200c421 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Tue, 1 Jul 2008 15:38:13 +0200 Subject: x86: small unifications of address printing 'man 3 printf' tells me that %p should be printed as if by %#x, but this is not true for the kernel, which does not use the '0x' prefix for the %p conversion specifier. A small cast to (void *) is also prettier than #ifdef/#else/#endif. Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/fault.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8bcb6f4..0eb70d1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -396,11 +396,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); -#ifdef CONFIG_X86_32 - printk(KERN_CONT " at %08lx\n", address); -#else - printk(KERN_CONT " at %016lx\n", address); -#endif + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); dump_pagetable(address); @@ -800,14 +796,10 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { printk( -#ifdef CONFIG_X86_32 - "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", -#else - "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", -#endif + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->ip, - regs->sp, error_code); + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); } -- cgit v1.1 From bc4e0f9ae2b1b133d36c1392d0145d2997cff272 Mon Sep 17 00:00:00 2001 From: Ben Castricum <lk0806@bencastricum.nl> Date: Tue, 10 Jun 2008 13:15:12 +0200 Subject: x86: microcode: cosmetic changes First announce ourself, then start working. Currently this module reports itself when all is completed which is not most modules do. Plus some cosmetic/whitespace cleanups. Signed-off-by: Ben Castricum <lk0806@bencastricum.nl> Cc: trivial@kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/microcode.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 69729e3..9758fea 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -5,13 +5,14 @@ * 2006 Shaohua Li <shaohua.li@intel.com> * * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, + * belonging to IA-32 family - PentiumPro, Pentium II, * Pentium III, Xeon, Pentium 4, etc. * - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, - * Order Number 245472 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/245472.htm + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm * * For more information, go to http://www.urbanmyth.org/microcode * @@ -58,12 +59,12 @@ * nature of implementation. * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, * Jun Nakajima <jun.nakajima@intel.com> * Support for the microcode updates in the new format. * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode + * because we no longer hold a copy of applied microcode * in kernel memory. * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> * Fix sigmatch() macro to handle old CPUs with pf == 0. @@ -320,11 +321,11 @@ static void apply_microcode(int cpu) return; /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + spin_lock_irqsave(µcode_update_lock, flags); /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, + (unsigned long) uci->mc->bits, (unsigned long) uci->mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); @@ -341,7 +342,7 @@ static void apply_microcode(int cpu) return; } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", + "0x%x to 0x%x, date = %08x \n", cpu_num, uci->rev, val[1], uci->mc->hdr.date); uci->rev = val[1]; } @@ -534,7 +535,7 @@ static int cpu_request_microcode(int cpu) c->x86, c->x86_model, c->x86_mask); error = request_firmware(&firmware, name, µcode_pdev->dev); if (error) { - pr_debug("microcode: ucode data file %s load failed\n", name); + pr_debug("microcode: data file %s load failed\n", name); return error; } buf = firmware->data; @@ -805,6 +806,9 @@ static int __init microcode_init (void) { int error; + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); + error = microcode_dev_init(); if (error) return error; @@ -825,9 +829,6 @@ static int __init microcode_init (void) } register_hotcpu_notifier(&mc_cpu_notifier); - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); return 0; } -- cgit v1.1 From 6bcb13b35a2ea39be6c7cc0292b8ad1191b1a748 Mon Sep 17 00:00:00 2001 From: Ben Collins <ben.collins@canonical.com> Date: Wed, 18 Jun 2008 14:04:35 -0400 Subject: x86: config option to disable info from decompression of the kernel This patch allows the disabling of decompression messages during x86 bootup. Signed-off-by: Ben Collins <ben.collins@canonical.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig.debug | 8 ++++++++ arch/x86/boot/compressed/misc.c | 16 +++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31b..14abaa5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -16,6 +16,14 @@ config NONPROMISC_DEVMEM obviously disasterous, but specific access can be used by people debugging the kernel. +config X86_VERBOSE_BOOTUP + bool "Enable verbose x86 bootup info messages" + default y + help + Enables the informational output from the decompression stage + (e.g. bzImage) of the boot. If you disable this you will still + see errors. Disable this if you want silent bootup. + config EARLY_PRINTK bool "Early printk" if EMBEDDED default y diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index d10e727..11629e9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -202,7 +202,8 @@ static void free(void *where); static void *memset(void *s, int c, unsigned n); static void *memcpy(void *dest, const void *src, unsigned n); -static void putstr(const char *); +static void __putstr(int, const char *); +#define putstr(__x) __putstr(0, __x) #ifdef CONFIG_X86_64 #define memptr long @@ -266,11 +267,16 @@ static void scroll(void) vidmem[i] = ' '; } -static void putstr(const char *s) +static void __putstr(int error, const char *s) { int x, y, pos; char c; +#ifndef CONFIG_X86_VERBOSE_BOOTUP + if (!error) + return; +#endif + #ifdef CONFIG_X86_32 if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) @@ -363,9 +369,9 @@ static void flush_window(void) static void error(char *x) { - putstr("\n\n"); - putstr(x); - putstr("\n\n -- System halted"); + __putstr(1, "\n\n"); + __putstr(1, x); + __putstr(1, "\n\n -- System halted"); while (1) asm("hlt"); -- cgit v1.1 From 2d144e63098be47c21ad59d68a4fd17bd73a3aaf Mon Sep 17 00:00:00 2001 From: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue, 24 Jun 2008 17:12:56 -0700 Subject: x86, mce_64.c: mce_cpu_quirks being ignored Quirks getting ignored was a bug. Below patch fixes the bug, until we have the dynamic banks support. Sysfs choice configuration should not have any issues with the earlier patch as we look for NR_SYSFS_BANKS in do_machine_check(). Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Max Asbock <masbock@us.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 8c8299c..501ca1c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -463,7 +463,11 @@ static void mce_init(void *dummy) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + if (i < NR_SYSFS_BANKS) + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + else + wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } -- cgit v1.1 From 95c60b08c6af6db2165837139da10f593462d51c Mon Sep 17 00:00:00 2001 From: Gustavo Fernando Padovan <gustavo@las.ic.unicamp.br> Date: Wed, 25 Jun 2008 04:03:19 -0300 Subject: x86: remove unnecessary #ifdef CONFIG_X86_32...#else Remove the #ifdef conditional because this comparison is already done in user_mode_vm(). Signed-off-by: Gustavo F. Padovan <gustavo@las.ic.unicamp.br> Cc: akpm@osdl.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8bcb6f4..1e64795 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,11 +55,7 @@ static inline int notify_page_fault(struct pt_regs *regs) int ret = 0; /* kprobe_running() needs smp_processor_id() */ -#ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { -#else - if (!user_mode(regs)) { -#endif preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; -- cgit v1.1 From 18d22200740b83a5d968d3236c27c310b3e8fda6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:06 +0200 Subject: x86, AMD IOMMU: more verbose Kconfig description text This patch replaces the short description text for AMD IOMMU in Kconfig with a more verbose one. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8aae462..b120cff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -554,7 +554,15 @@ config AMD_IOMMU select SWIOTLB depends on X86_64 && PCI && ACPI help - Select this to get support for AMD IOMMU hardware in your system. + With this option you can enable support for AMD IOMMU hardware in + your system. An IOMMU is a hardware component which provides + remapping of DMA memory accesses from devices. With an AMD IOMMU you + can isolate the the DMA memory of different devices and protect the + system from misbehaving device drivers or hardware. + + You can find out if your system has an AMD IOMMU if you look into + your BIOS for an option to enable it or if you have an IVRS ACPI + table. # need this always selected by IOMMU for the VIA workaround config SWIOTLB -- cgit v1.1 From 5f6a59d8ad55781d4d2ff0d327f84aaeed2c4127 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:07 +0200 Subject: x86, AMD IOMMU: remove unnecessary set_bit_string The set_bit_string call in the address allocator is not necessary because its already called in iommu_area_alloc(). Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a1b3856..329b2c3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -301,7 +301,6 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, 0, boundary_size, 0); if (likely(address != -1)) { - set_bit_string(dom->bitmap, address, pages); dom->next_bit = address + pages; address <<= PAGE_SHIFT; } else -- cgit v1.1 From 999ba417cc1a43881126d08876d5d7e653113ae3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:08 +0200 Subject: x86, AMD IOMMU: flush domain TLB when there is more than one page to flush This patch changes the domain TLB flushing behavior of the driver. When there is more than one page to flush it flushes the whole domain TLB instead of every single page. So we send only a single command to the IOMMU in every case which is faster to execute. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu.c | 14 ++++++++++---- include/asm-x86/amd_iommu_types.h | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 329b2c3..f2766d8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -140,16 +140,22 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { - int i; + int s = 0; unsigned pages = to_pages(address, size); address &= PAGE_MASK; - for (i = 0; i < pages; ++i) { - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 0); - address += PAGE_SIZE; + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; } + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); + return 0; } diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h index 0f39550..7bfcb47 100644 --- a/include/asm-x86/amd_iommu_types.h +++ b/include/asm-x86/amd_iommu_types.h @@ -93,6 +93,8 @@ #define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 #define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 +#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL + /* macros and definitions for device table entries */ #define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_TRANSLATION 0x01 -- cgit v1.1 From 8b14518fadd9d5915827d86d5c10e602fedf042e Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:09 +0200 Subject: x86, AMD IOMMU: honor iommu=off instead of amd_iommu=off This patch removes the amd_iommu=off kernel parameter and honors the generic iommu=off parameter for the same purpose. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5d9e45c..c54c823 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -101,8 +101,6 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); -static int __initdata amd_iommu_disable; - u16 amd_iommu_last_bdf; struct list_head amd_iommu_unity_map; unsigned amd_iommu_aperture_order = 26; @@ -686,7 +684,7 @@ int __init amd_iommu_init(void) int i, ret = 0; - if (amd_iommu_disable) { + if (no_iommu) { printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); return 0; } @@ -831,9 +829,6 @@ void __init amd_iommu_detect(void) if (swiotlb || no_iommu || iommu_detected) return; - if (amd_iommu_disable) - return; - if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; #ifdef CONFIG_GART_IOMMU @@ -846,8 +841,6 @@ void __init amd_iommu_detect(void) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strcmp(str, "off") == 0) - amd_iommu_disable = 1; if (strcmp(str, "isolate") == 0) amd_iommu_isolate = 1; } -- cgit v1.1 From c1cbebeec4a6be1c1fb54fbf2395ade2534b03c4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:10 +0200 Subject: x86, AMD IOMMU: don't try to init IOMMU if early detect code did not detect one This patch adds a check if the early detect code has found AMD IOMMU hardware descriptions and does not try to initialize hardware if the check failed. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c54c823..c970fac 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -101,6 +101,8 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +static int __initdata amd_iommu_detected; + u16 amd_iommu_last_bdf; struct list_head amd_iommu_unity_map; unsigned amd_iommu_aperture_order = 26; @@ -689,6 +691,9 @@ int __init amd_iommu_init(void) return 0; } + if (!amd_iommu_detected) + return -ENODEV; + /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -831,6 +836,7 @@ void __init amd_iommu_detect(void) if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; + amd_iommu_detected = 1; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; -- cgit v1.1 From fb339690a0506d2aca447b5e888fd6ac39ea4fa2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel <joerg.roedel@amd.com> Date: Thu, 3 Jul 2008 19:35:11 +0200 Subject: x86, AMD IOMMU: remove unnecessary code from the iommu_enable function This code removes a leftover from the iommu_enable function. The ctrl variable is assigned but never used. Signed-off-by: Joerg Roedel <joerg.roedel@amd.com> Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Joerg Roedel <joerg.roedel@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/amd_iommu_init.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c970fac..2a13e43 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -169,14 +169,11 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) void __init iommu_enable(struct amd_iommu *iommu) { - u32 ctrl; - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); print_devid(iommu->devid, 0); printk(" cap 0x%hx\n", iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); - ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); } static u8 * __init iommu_map_mmio_space(u64 address) -- cgit v1.1 From 84e65b0a84a2c856bef36f13d122047678408b0a Mon Sep 17 00:00:00 2001 From: Richard Kennedy <richard@rsk.demon.co.uk> Date: Fri, 4 Jul 2008 13:56:16 +0100 Subject: x86: cacheline_align tss_struct The manual padding to align on cacheline size only worked in 32 bit In 64 bit the structure was not aligned and contained wasted space. use the compiler ____cachline_aligned to save space & properly align this structure. x86_64_default size goes from 9136 -> 8960 x86_64_AMD size goes from 9136 -> 8896 built & running on 2.6.26-rc8. Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/processor.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 5591052..4ab2ede 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -263,15 +263,11 @@ struct tss_struct { struct thread_struct *io_bitmap_owner; /* - * Pad the TSS to be cacheline-aligned (size is 0x100): - */ - unsigned long __cacheline_filler[35]; - /* * .. and then another 0x100 bytes for the emergency kernel stack: */ unsigned long stack[64]; -} __attribute__((packed)); +} ____cacheline_aligned; DECLARE_PER_CPU(struct tss_struct, init_tss); -- cgit v1.1 From 8b7fd21511f9b5016665814e03f7fc948bb64a98 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Date: Mon, 7 Jul 2008 16:40:26 +0900 Subject: x86: clean up amd_iommu documentation amd_iommu=off was replaced with a common parameter, iommu=off. Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Cc: joerg.roedel@amd.com Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/kernel-parameters.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fc8f936..6b757b4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -274,7 +274,6 @@ and is between 256 and 4096 characters. It is defined in the file amd_iommu= [HW,X86-84] Pass parameters to the AMD IOMMU driver in the system. Possible values are: - off - disable the driver for AMD IOMMU isolate - enable device isolation (each device, as far as possible, will get its own protection domain) -- cgit v1.1 From aa276e1cafb3ce9d01d1e837bcd67e92616013ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 9 Jun 2008 19:15:00 +0200 Subject: x86, clockevents: add C1E aware idle function C1E on AMD machines is like C3 but without control from the OS. Up to now we disabled the local apic timer for those machines as it stops when the CPU goes into C1E. This excludes those machines from high resolution timers / dynamic ticks, which hurts especially X2 based laptops. The current boot time C1E detection has another, more serious flaw as well: some BIOSes do not enable C1E until the ACPI processor module is loaded. This causes systems to stop working after that point. To work nicely with C1E enabled machines we use a separate idle function, which checks on idle entry whether C1E was enabled in the Interrupt Pending Message MSR. This allows us to do timer broadcasting for C1E and covers the late enablement of C1E as well. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 5 ++-- arch/x86/kernel/apic_64.c | 26 +---------------- arch/x86/kernel/cpu/amd.c | 30 -------------------- arch/x86/kernel/cpu/amd_64.c | 25 ----------------- arch/x86/kernel/process.c | 66 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/apic.h | 3 -- kernel/time/tick-broadcast.c | 6 ++-- 7 files changed, 73 insertions(+), 88 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..c44206e 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -64,9 +64,8 @@ static int enable_local_apic __initdata; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; -/* Disable local APIC timer from the kernel commandline or via dmi quirk - or using CPU MSR check */ -int local_apic_timer_disabled; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int local_apic_timer_disabled; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd..a5cc844 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -43,7 +43,7 @@ #include <mach_ipi.h> #include <mach_apic.h> -int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; @@ -422,32 +422,8 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -/* - * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the - * C1E flag only in the secondary CPU, so when we detect the wreckage - * we already have enabled the boot CPU local apic timer. Check, if - * disable_apic_timer is set and the DUMMY flag is cleared. If yes, - * set the DUMMY flag again and force the broadcast mode in the - * clockevents layer. - */ -static void __cpuinit check_boot_apic_timer_broadcast(void) -{ - if (!disable_apic_timer || - (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) - return; - - printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); - lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, - &boot_cpu_physical_apicid); - local_irq_disable(); -} - void __cpuinit setup_secondary_APIC_clock(void) { - check_boot_apic_timer_broadcast(); setup_APIC_timer(); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e76b49e..acc891a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,31 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -#ifdef CONFIG_X86_LOCAL_APIC - -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) -{ - u32 lo, hi; - - if (c->x86 < 0x0F) - return 0; - - /* Family 0x0f models < rev F do not have this MSR */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - if (smp_processor_id() != boot_cpu_physical_apicid) - printk(KERN_INFO "AMD C1E detected late. " - "Force timer broadcast.\n"); - return 1; - } - return 0; -} -#endif - int force_mwait __cpuinitdata; static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) @@ -285,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } -#ifdef CONFIG_X86_LOCAL_APIC - if (amd_apic_timer_broken(c)) - local_apic_timer_disabled = 1; -#endif - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index f5fc161..f8d2058 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -110,28 +110,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ -static __cpuinit int amd_apic_timer_broken(struct cpuinfo_x86 *c) -{ - u32 lo, hi; - - if (c->x86 < 0x0F) - return 0; - - /* Family 0x0f models < rev F do not have this MSR */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - if (smp_processor_id() != boot_cpu_physical_apicid) - printk(KERN_INFO "AMD C1E detected late. " - "Force timer broadcast.\n"); - return 1; - } - return 0; -} - void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -212,9 +190,6 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) amd_enable_pci_ext_cfg(c); - if (amd_apic_timer_broken(c)) - disable_apic_timer = 1; - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { unsigned long long tseg; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9fea146..68ad353 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/pm.h> +#include <linux/clockchips.h> struct kmem_cache *task_xstate_cachep; @@ -219,6 +220,68 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +/* + * Check for AMD CPUs, which have potentially C1E support + */ +static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_AMD) + return 0; + + if (c->x86 < 0x0F) + return 0; + + /* Family 0x0f models < rev F do not have C1E */ + if (c->x86 == 0x0f && c->x86_model < 0x40) + return 0; + + return 1; +} + +/* + * C1E aware idle routine. We check for C1E active in the interrupt + * pending message MSR. If we detect C1E, then we handle it the same + * way as C3 power states (local apic timer and TSC stop) + */ +static void c1e_idle(void) +{ + static cpumask_t c1e_mask = CPU_MASK_NONE; + static int c1e_detected; + + if (need_resched()) + return; + + if (!c1e_detected) { + u32 lo, hi; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + c1e_detected = 1; + mark_tsc_unstable("TSC halt in C1E"); + printk(KERN_INFO "System has C1E enabled\n"); + } + } + + if (c1e_detected) { + int cpu = smp_processor_id(); + + if (!cpu_isset(cpu, c1e_mask)) { + cpu_set(cpu, c1e_mask); + /* Force broadcast so ACPI can not interfere */ + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + &cpu); + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", + cpu); + } + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + default_idle(); + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); + } else + default_idle(); +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_SMP @@ -236,6 +299,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; + } else if (check_c1e_idle(c)) { + printk(KERN_INFO "using C1E aware idle routine\n"); + pm_idle = c1e_idle; } else pm_idle = default_idle; } diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index be9639a..3c387cd 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -38,12 +38,9 @@ extern void generic_apic_probe(void); extern int apic_verbosity; extern int timer_over_8254; extern int local_apic_timer_c2_ok; -extern int local_apic_timer_disabled; -extern int apic_runs_main_timer; extern int ioapic_force; extern int disable_apic; -extern int disable_apic_timer; /* * Basic functions accessing APICs. diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02..67f80c2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -30,6 +30,7 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); @@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why) CLOCK_EVT_MODE_SHUTDOWN); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - dev->features |= CLOCK_EVT_FEAT_DUMMY; + tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (cpu_isset(cpu, tick_broadcast_mask)) { + if (!tick_broadcast_force && + cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); -- cgit v1.1 From 0beefa208bb3a9e581a60125703409ebe6f7fa53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Tue, 17 Jun 2008 09:12:03 +0200 Subject: x86: add C1E aware idle function, fix On Tue, 17 Jun 2008, Rafael J. Wysocki wrote: > > BTW, with the C1E patches reverted I don't get the > WARNING: at /home/rafael/src/linux-next/kernel/smp.c:215 smp_call_function_single+0x3d/0xa2 > in the log. Thomas? The BROADCAST_FORCE notification uses smp_function_call and therefor must be run with interrupts enabled. While at it, add a comment for the BROADCAST_EXIT notifier as well. Reported-and-bisected-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 68ad353..4061d63 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -267,17 +267,29 @@ static void c1e_idle(void) if (!cpu_isset(cpu, c1e_mask)) { cpu_set(cpu, c1e_mask); - /* Force broadcast so ACPI can not interfere */ + /* + * Force broadcast so ACPI can not interfere. Needs + * to run with interrupts enabled as it uses + * smp_function_call. + */ + local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); + local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + default_idle(); - local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); - local_irq_enable(); + + /* + * The switch back from broadcast mode needs to be + * called with interrupts disabled. + */ + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); } else default_idle(); } -- cgit v1.1 From 24bfdca7b7da971ef9a483303a096ac6d4b3a02c Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Thu, 12 Jun 2008 20:19:22 +0200 Subject: x86/pci: Renaming k8-bus_64.c to amd_bus.c The name fits better since this is code not only for K8. Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/pci/Makefile_64 | 2 +- arch/x86/pci/amd_bus.c | 528 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/pci/k8-bus_64.c | 528 ----------------------------------------------- 3 files changed, 529 insertions(+), 529 deletions(-) create mode 100644 arch/x86/pci/amd_bus.c delete mode 100644 arch/x86/pci/k8-bus_64.c diff --git a/arch/x86/pci/Makefile_64 b/arch/x86/pci/Makefile_64 index 8fbd198..fd47068 100644 --- a/arch/x86/pci/Makefile_64 +++ b/arch/x86/pci/Makefile_64 @@ -13,5 +13,5 @@ obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_64.o direct.o mmconfig-shared.o -obj-y += k8-bus_64.o +obj-y += amd_bus.o diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c new file mode 100644 index 0000000..5c2799c --- /dev/null +++ b/arch/x86/pci/amd_bus.c @@ -0,0 +1,528 @@ +#include <linux/init.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> +#include <asm/mpspec.h> +#include <linux/cpumask.h> +#include <linux/topology.h> + +/* + * This discovers the pcibus <-> node mapping on AMD K8. + * also get peer root bus resource for io,mmio + */ + + +/* + * sub bus (transparent) will use entres from 3 to store extra from root, + * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? + */ +#define RES_NUM 16 +struct pci_root_info { + char name[12]; + unsigned int res_num; + struct resource res[RES_NUM]; + int bus_min; + int bus_max; + int node; + int link; +}; + +/* 4 at this time, it may become to 32 */ +#define PCI_ROOT_NR 4 +static int pci_root_num; +static struct pci_root_info pci_root_info[PCI_ROOT_NR]; + +#ifdef CONFIG_NUMA + +#define BUS_NR 256 + +static int mp_bus_to_node[BUS_NR]; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node = -1; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return node; + + node = mp_bus_to_node[busnum]; + + /* + * let numa_node_id to decide it later in dma_alloc_pages + * if there is no ram on that node + */ + if (node != -1 && !node_online(node)) + node = -1; + + return node; +} +#endif + +void set_pci_bus_resources_arch_default(struct pci_bus *b) +{ + int i; + int j; + struct pci_root_info *info; + + /* if only one root bus, don't need to anything */ + if (pci_root_num < 2) + return; + + for (i = 0; i < pci_root_num; i++) { + if (pci_root_info[i].bus_min == b->number) + break; + } + + if (i == pci_root_num) + return; + + info = &pci_root_info[i]; + for (j = 0; j < info->res_num; j++) { + struct resource *res; + struct resource *root; + + res = &info->res[j]; + b->resource[j] = res; + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } +} + +#define RANGE_NUM 16 + +struct res_range { + size_t start; + size_t end; +}; + +static void __init update_range(struct res_range *range, size_t start, + size_t end) +{ + int i; + int j; + + for (j = 0; j < RANGE_NUM; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* find the new spare */ + for (i = 0; i < RANGE_NUM; i++) { + if (range[i].end == 0) + break; + } + if (i < RANGE_NUM) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static void __init update_res(struct pci_root_info *info, size_t start, + size_t end, unsigned long flags, int merge) +{ + int i; + struct resource *res; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + for (i = 0; i < info->res_num; i++) { + size_t final_start, final_end; + size_t common_start, common_end; + + res = &info->res[i]; + if (res->flags != flags) + continue; + + common_start = max((size_t)res->start, start); + common_end = min((size_t)res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min((size_t)res->start, start); + final_end = max((size_t)res->end, end); + + res->start = final_start; + res->end = final_end; + return; + } + +addit: + + /* need to add that */ + if (info->res_num >= RES_NUM) + return; + + res = &info->res[info->res_num]; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + res->child = NULL; + info->res_num++; +} + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static struct pci_hostbridge_probe pci_probes[] __initdata = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, +}; + +static u64 __initdata fam10h_mmconf_start; +static u64 __initdata fam10h_mmconf_end; +static void __init get_pci_mmcfg_amd_fam10h_range(void) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + /* assume all cpus from fam10h have mmconf */ + if (boot_cpu_data.x86 < 0x10) + return; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enable */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); + + segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + fam10h_mmconf_start = base; + fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; +} + +/** + * early_fill_mp_bus_to_node() + * called before pcibios_scan_root and pci_scan_bus + * fills the mp_bus_to_cpumask array based according to the LDT Bus Number + * Registers found in the K8 northbridge + */ +static int __init early_fill_mp_bus_info(void) +{ + int i; + int j; + unsigned bus; + unsigned slot; + int found; + int node; + int link; + int def_node; + int def_link; + struct pci_root_info *info; + u32 reg; + struct resource *res; + size_t start; + size_t end; + struct res_range range[RANGE_NUM]; + u64 val; + u32 address; + +#ifdef CONFIG_NUMA + for (i = 0; i < BUS_NR; i++) + mp_bus_to_node[i] = -1; +#endif + + if (!early_pci_allowed()) + return -1; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + return 0; + + pci_root_num = 0; + for (i = 0; i < 4; i++) { + int min_bus; + int max_bus; + reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); + + /* Check if that register is enabled for bus range */ + if ((reg & 7) != 3) + continue; + + min_bus = (reg >> 16) & 0xff; + max_bus = (reg >> 24) & 0xff; + node = (reg >> 4) & 0x07; +#ifdef CONFIG_NUMA + for (j = min_bus; j <= max_bus; j++) + mp_bus_to_node[j] = (unsigned char) node; +#endif + link = (reg >> 8) & 0x03; + + info = &pci_root_info[pci_root_num]; + info->bus_min = min_bus; + info->bus_max = max_bus; + info->node = node; + info->link = link; + sprintf(info->name, "PCI Bus #%02x", min_bus); + pci_root_num++; + } + + /* get the default node and link for left over res */ + reg = read_pci_config(bus, slot, 0, 0x60); + def_node = (reg >> 8) & 0x07; + reg = read_pci_config(bus, slot, 0, 0x64); + def_link = (reg >> 8) & 0x03; + + memset(range, 0, sizeof(range)); + range[0].end = 0xffff; + /* io port resource */ + for (i = 0; i < 4; i++) { + reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); + if (!(reg & 3)) + continue; + + start = reg & 0xfff000; + reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3)); + node = reg & 0x07; + link = (reg >> 4) & 0x03; + end = (reg & 0xfff000) | 0xfff; + + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == node && info->link == link) + break; + } + if (j == pci_root_num) + continue; /* not found */ + + info = &pci_root_info[j]; + printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", + node, link, (u64)start, (u64)end); + + /* kernel only handle 16 bit only */ + if (end > 0xffff) + end = 0xffff; + update_res(info, start, end, IORESOURCE_IO, 1); + update_range(range, start, end); + } + /* add left over io port range to def node/link, [0, 0xffff] */ + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == def_node && info->link == def_link) + break; + } + if (j < pci_root_num) { + info = &pci_root_info[j]; + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + update_res(info, range[i].start, range[i].end, + IORESOURCE_IO, 1); + } + } + + memset(range, 0, sizeof(range)); + /* 0xfd00000000-0xffffffffff for HT */ + range[0].end = (0xfdULL<<32) - 1; + + /* need to take out [0, TOM) for RAM*/ + address = MSR_K8_TOP_MEM1; + rdmsrl(address, val); + end = (val & 0xffffff8000000ULL); + printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); + if (end < (1ULL<<32)) + update_range(range, 0, end - 1); + + /* get mmconfig */ + get_pci_mmcfg_amd_fam10h_range(); + /* need to take out mmconf range */ + if (fam10h_mmconf_end) { + printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); + update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + } + + /* mmio resource */ + for (i = 0; i < 8; i++) { + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = reg & 0xffffff00; /* 39:16 on 31:8*/ + start <<= 8; + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + node = reg & 0x07; + link = (reg >> 4) & 0x03; + end = (reg & 0xffffff00); + end <<= 8; + end |= 0xffff; + + /* find the position */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == node && info->link == link) + break; + } + if (j == pci_root_num) + continue; /* not found */ + + info = &pci_root_info[j]; + + printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", + node, link, (u64)start, (u64)end); + /* + * some sick allocation would have range overlap with fam10h + * mmconf range, so need to update start and end. + */ + if (fam10h_mmconf_end) { + int changed = 0; + u64 endx = 0; + if (start >= fam10h_mmconf_start && + start <= fam10h_mmconf_end) { + start = fam10h_mmconf_end + 1; + changed = 1; + } + + if (end >= fam10h_mmconf_start && + end <= fam10h_mmconf_end) { + end = fam10h_mmconf_start - 1; + changed = 1; + } + + if (start < fam10h_mmconf_start && + end > fam10h_mmconf_end) { + /* we got a hole */ + endx = fam10h_mmconf_start - 1; + update_res(info, start, endx, IORESOURCE_MEM, 0); + update_range(range, start, endx); + printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); + start = fam10h_mmconf_end + 1; + changed = 1; + } + if (changed) { + if (start <= end) { + printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); + } else { + printk(KERN_CONT "%s\n", endx?"":" ==> none"); + continue; + } + } + } + + update_res(info, start, end, IORESOURCE_MEM, 1); + update_range(range, start, end); + printk(KERN_CONT "\n"); + } + + /* need to take out [4G, TOM2) for RAM*/ + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + /* TOP_MEM2 is enabled? */ + if (val & (1<<21)) { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + end = (val & 0xffffff8000000ULL); + printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); + update_range(range, 1ULL<<32, end - 1); + } + + /* + * add left over mmio range to def node/link ? + * that is tricky, just record range in from start_min to 4G + */ + for (j = 0; j < pci_root_num; j++) { + info = &pci_root_info[j]; + if (info->node == def_node && info->link == def_link) + break; + } + if (j < pci_root_num) { + info = &pci_root_info[j]; + + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + + update_res(info, range[i].start, range[i].end, + IORESOURCE_MEM, 1); + } + } + + for (i = 0; i < pci_root_num; i++) { + int res_num; + int busnum; + + info = &pci_root_info[i]; + res_num = info->res_num; + busnum = info->bus_min; + printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", + info->bus_min, info->bus_max, info->node, info->link); + for (j = 0; j < res_num; j++) { + res = &info->res[j]; + printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", + busnum, j, + (res->flags & IORESOURCE_IO)?"io port":"mmio", + res->start, res->end); + } + } + + return 0; +} + +postcore_initcall(early_fill_mp_bus_info); diff --git a/arch/x86/pci/k8-bus_64.c b/arch/x86/pci/k8-bus_64.c deleted file mode 100644 index 5c2799c..0000000 --- a/arch/x86/pci/k8-bus_64.c +++ /dev/null @@ -1,528 +0,0 @@ -#include <linux/init.h> -#include <linux/pci.h> -#include <asm/pci-direct.h> -#include <asm/mpspec.h> -#include <linux/cpumask.h> -#include <linux/topology.h> - -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * also get peer root bus resource for io,mmio - */ - - -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; - -#ifdef CONFIG_NUMA - -#define BUS_NR 256 - -static int mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node = -1; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return node; - - node = mp_bus_to_node[busnum]; - - /* - * let numa_node_id to decide it later in dma_alloc_pages - * if there is no ram on that node - */ - if (node != -1 && !node_online(node)) - node = -1; - - return node; -} -#endif - -void set_pci_bus_resources_arch_default(struct pci_bus *b) -{ - int i; - int j; - struct pci_root_info *info; - - /* if only one root bus, don't need to anything */ - if (pci_root_num < 2) - return; - - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) - break; - } - - if (i == pci_root_num) - return; - - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { - struct resource *res; - struct resource *root; - - res = &info->res[j]; - b->resource[j] = res; - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } -} - -#define RANGE_NUM 16 - -struct res_range { - size_t start; - size_t end; -}; - -static void __init update_range(struct res_range *range, size_t start, - size_t end) -{ - int i; - int j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) -{ - int i; - struct resource *res; - - if (!merge) - goto addit; - - /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; - - res = &info->res[i]; - if (res->flags != flags) - continue; - - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); - if (common_start > common_end + 1) - continue; - - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); - - res->start = final_start; - res->end = final_end; - return; - } - -addit: - - /* need to add that */ - if (info->res_num >= RES_NUM) - return; - - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - res->child = NULL; - info->res_num++; -} - -struct pci_hostbridge_probe { - u32 bus; - u32 slot; - u32 vendor; - u32 device; -}; - -static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, -}; - -static u64 __initdata fam10h_mmconf_start; -static u64 __initdata fam10h_mmconf_end; -static void __init get_pci_mmcfg_amd_fam10h_range(void) -{ - u32 address; - u64 base, msr; - unsigned segn_busn_bits; - - /* assume all cpus from fam10h have mmconf */ - if (boot_cpu_data.x86 < 0x10) - return; - - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - - /* mmconfig is not enable */ - if (!(msr & FAM10H_MMIO_CONF_ENABLE)) - return; - - base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); - - segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & - FAM10H_MMIO_CONF_BUSRANGE_MASK; - - fam10h_mmconf_start = base; - fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; -} - -/** - * early_fill_mp_bus_to_node() - * called before pcibios_scan_root and pci_scan_bus - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge - */ -static int __init early_fill_mp_bus_info(void) -{ - int i; - int j; - unsigned bus; - unsigned slot; - int found; - int node; - int link; - int def_node; - int def_link; - struct pci_root_info *info; - u32 reg; - struct resource *res; - size_t start; - size_t end; - struct res_range range[RANGE_NUM]; - u64 val; - u32 address; - -#ifdef CONFIG_NUMA - for (i = 0; i < BUS_NR; i++) - mp_bus_to_node[i] = -1; -#endif - - if (!early_pci_allowed()) - return -1; - - found = 0; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { - u32 id; - u16 device; - u16 vendor; - - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; - id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - - vendor = id & 0xffff; - device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { - found = 1; - break; - } - } - - if (!found) - return 0; - - pci_root_num = 0; - for (i = 0; i < 4; i++) { - int min_bus; - int max_bus; - reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); - - /* Check if that register is enabled for bus range */ - if ((reg & 7) != 3) - continue; - - min_bus = (reg >> 16) & 0xff; - max_bus = (reg >> 24) & 0xff; - node = (reg >> 4) & 0x07; -#ifdef CONFIG_NUMA - for (j = min_bus; j <= max_bus; j++) - mp_bus_to_node[j] = (unsigned char) node; -#endif - link = (reg >> 8) & 0x03; - - info = &pci_root_info[pci_root_num]; - info->bus_min = min_bus; - info->bus_max = max_bus; - info->node = node; - info->link = link; - sprintf(info->name, "PCI Bus #%02x", min_bus); - pci_root_num++; - } - - /* get the default node and link for left over res */ - reg = read_pci_config(bus, slot, 0, 0x60); - def_node = (reg >> 8) & 0x07; - reg = read_pci_config(bus, slot, 0, 0x64); - def_link = (reg >> 8) & 0x03; - - memset(range, 0, sizeof(range)); - range[0].end = 0xffff; - /* io port resource */ - for (i = 0; i < 4; i++) { - reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); - if (!(reg & 3)) - continue; - - start = reg & 0xfff000; - reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3)); - node = reg & 0x07; - link = (reg >> 4) & 0x03; - end = (reg & 0xfff000) | 0xfff; - - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) - continue; /* not found */ - - info = &pci_root_info[j]; - printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", - node, link, (u64)start, (u64)end); - - /* kernel only handle 16 bit only */ - if (end > 0xffff) - end = 0xffff; - update_res(info, start, end, IORESOURCE_IO, 1); - update_range(range, start, end); - } - /* add left over io port range to def node/link, [0, 0xffff] */ - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - - update_res(info, range[i].start, range[i].end, - IORESOURCE_IO, 1); - } - } - - memset(range, 0, sizeof(range)); - /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = (0xfdULL<<32) - 1; - - /* need to take out [0, TOM) for RAM*/ - address = MSR_K8_TOP_MEM1; - rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); - printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); - if (end < (1ULL<<32)) - update_range(range, 0, end - 1); - - /* get mmconfig */ - get_pci_mmcfg_amd_fam10h_range(); - /* need to take out mmconf range */ - if (fam10h_mmconf_end) { - printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); - } - - /* mmio resource */ - for (i = 0; i < 8; i++) { - reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); - if (!(reg & 3)) - continue; - - start = reg & 0xffffff00; /* 39:16 on 31:8*/ - start <<= 8; - reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); - node = reg & 0x07; - link = (reg >> 4) & 0x03; - end = (reg & 0xffffff00); - end <<= 8; - end |= 0xffff; - - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) - continue; /* not found */ - - info = &pci_root_info[j]; - - printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", - node, link, (u64)start, (u64)end); - /* - * some sick allocation would have range overlap with fam10h - * mmconf range, so need to update start and end. - */ - if (fam10h_mmconf_end) { - int changed = 0; - u64 endx = 0; - if (start >= fam10h_mmconf_start && - start <= fam10h_mmconf_end) { - start = fam10h_mmconf_end + 1; - changed = 1; - } - - if (end >= fam10h_mmconf_start && - end <= fam10h_mmconf_end) { - end = fam10h_mmconf_start - 1; - changed = 1; - } - - if (start < fam10h_mmconf_start && - end > fam10h_mmconf_end) { - /* we got a hole */ - endx = fam10h_mmconf_start - 1; - update_res(info, start, endx, IORESOURCE_MEM, 0); - update_range(range, start, endx); - printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); - start = fam10h_mmconf_end + 1; - changed = 1; - } - if (changed) { - if (start <= end) { - printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); - } else { - printk(KERN_CONT "%s\n", endx?"":" ==> none"); - continue; - } - } - } - - update_res(info, start, end, IORESOURCE_MEM, 1); - update_range(range, start, end); - printk(KERN_CONT "\n"); - } - - /* need to take out [4G, TOM2) for RAM*/ - /* SYS_CFG */ - address = MSR_K8_SYSCFG; - rdmsrl(address, val); - /* TOP_MEM2 is enabled? */ - if (val & (1<<21)) { - /* TOP_MEM2 */ - address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); - end = (val & 0xffffff8000000ULL); - printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); - update_range(range, 1ULL<<32, end - 1); - } - - /* - * add left over mmio range to def node/link ? - * that is tricky, just record range in from start_min to 4G - */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; - - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - - update_res(info, range[i].start, range[i].end, - IORESOURCE_MEM, 1); - } - } - - for (i = 0; i < pci_root_num; i++) { - int res_num; - int busnum; - - info = &pci_root_info[i]; - res_num = info->res_num; - busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", - info->bus_min, info->bus_max, info->node, info->link); - for (j = 0; j < res_num; j++) { - res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", - busnum, j, - (res->flags & IORESOURCE_IO)?"io port":"mmio", - res->start, res->end); - } - } - - return 0; -} - -postcore_initcall(early_fill_mp_bus_info); -- cgit v1.1 From 3a27dd1ce5de08e21e0266ddf00e6f1f843bfe8b Mon Sep 17 00:00:00 2001 From: Robert Richter <robert.richter@amd.com> Date: Thu, 12 Jun 2008 20:19:23 +0200 Subject: x86: Move PCI IO ECS code to x86/pci "Form follows function". Code is now where it belongs to. Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd.c | 3 --- arch/x86/kernel/cpu/amd_64.c | 4 ---- arch/x86/kernel/cpu/cpu.h | 2 -- arch/x86/kernel/setup.c | 13 ------------- arch/x86/pci/Makefile_32 | 1 + arch/x86/pci/amd_bus.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/pci/direct.c | 16 +++++++++------- arch/x86/pci/pci.h | 1 + include/asm-x86/cpufeature.h | 2 -- 9 files changed, 43 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index acc891a..81a07ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -266,9 +266,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - - if (c->x86 == 0x10) - amd_enable_pci_ext_cfg(c); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index f8d2058..250bfe6 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -6,7 +6,6 @@ #include <asm/cacheflush.h> #include <mach_apic.h> -#include "cpu.h" extern int __cpuinit get_model_name(struct cpuinfo_x86 *c); extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c); @@ -187,9 +186,6 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x10) fam10h_check_enable_mmcfg(); - if (c->x86 == 0x10) - amd_enable_pci_ext_cfg(c); - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { unsigned long long tseg; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f5d5bb1..40ad189 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -39,5 +39,3 @@ extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); #endif /* CONFIG_X86_32 */ - -extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 20e14db..6f80b85 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -137,16 +137,3 @@ void __init setup_per_cpu_areas(void) } #endif -#define ENABLE_CF8_EXT_CFG (1ULL << 46) - -void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c) -{ - u64 reg; - rdmsrl(MSR_AMD64_NB_CFG, reg); - if (!(reg & ENABLE_CF8_EXT_CFG)) { - reg |= ENABLE_CF8_EXT_CFG; - wrmsrl(MSR_AMD64_NB_CFG, reg); - } - set_cpu_cap(c, X86_FEATURE_PCI_EXT_CFG); -} - diff --git a/arch/x86/pci/Makefile_32 b/arch/x86/pci/Makefile_32 index 89ec35d..f647e7e 100644 --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -22,3 +22,4 @@ pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o pci-$(CONFIG_NUMA) += mp_bus_to_node.o obj-y += $(pci-y) common.o early.o +obj-y += amd_bus.o diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 5c2799c..15f505d 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -1,5 +1,9 @@ #include <linux/init.h> #include <linux/pci.h> +#include "pci.h" + +#ifdef CONFIG_X86_64 + #include <asm/pci-direct.h> #include <asm/mpspec.h> #include <linux/cpumask.h> @@ -526,3 +530,31 @@ static int __init early_fill_mp_bus_info(void) } postcore_initcall(early_fill_mp_bus_info); + +#endif + +/* common 32/64 bit code */ + +#define ENABLE_CF8_EXT_CFG (1ULL << 46) + +static void enable_pci_io_ecs_per_cpu(void *unused) +{ + u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); + if (!(reg & ENABLE_CF8_EXT_CFG)) { + reg |= ENABLE_CF8_EXT_CFG; + wrmsrl(MSR_AMD64_NB_CFG, reg); + } +} + +static int __init enable_pci_io_ecs(void) +{ + /* assume all cpus from fam10h have IO ECS */ + if (boot_cpu_data.x86 < 0x10) + return 0; + on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1, 1); + pci_probe |= PCI_HAS_IO_ECS; + return 0; +} + +postcore_initcall(enable_pci_io_ecs); diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 27d61b6..9915293 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -265,14 +265,16 @@ void __init pci_direct_init(int type) type); if (type == 1) { raw_pci_ops = &pci_direct_conf1; - if (!raw_pci_ext_ops && cpu_has_pci_ext_cfg) { - printk(KERN_INFO "PCI: Using configuration type 1 " - "for extended access\n"); - raw_pci_ext_ops = &pci_direct_conf1; - } - } else { - raw_pci_ops = &pci_direct_conf2; + if (raw_pci_ext_ops) + return; + if (!(pci_probe & PCI_HAS_IO_ECS)) + return; + printk(KERN_INFO "PCI: Using configuration type 1 " + "for extended access\n"); + raw_pci_ext_ops = &pci_direct_conf1; + return; } + raw_pci_ops = &pci_direct_conf2; } int __init pci_direct_probe(void) diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index f3972b1..fd53db5 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -27,6 +27,7 @@ #define PCI_CAN_SKIP_ISA_ALIGN 0x8000 #define PCI_USE__CRS 0x10000 #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 +#define PCI_HAS_IO_ECS 0x40000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 40fcbba..0d609c8 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -79,7 +79,6 @@ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */ #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */ -#define X86_FEATURE_PCI_EXT_CFG (3*32+19) /* PCI extended cfg access */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ @@ -188,7 +187,6 @@ extern const char * const x86_power_flags[32]; #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_pci_ext_cfg boot_cpu_has(X86_FEATURE_PCI_EXT_CFG) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 -- cgit v1.1 From dcd32b6a1ffe6c040f8346f7fbaf4318bb8ae41c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 20 Jun 2008 08:18:09 +0200 Subject: x86: make 64-bit identify_cpu use cpu_dev we may need to move some functions to common.c later Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd_64.c | 17 +++++-- arch/x86/kernel/cpu/centaur_64.c | 16 ++++++- arch/x86/kernel/cpu/cpu.h | 6 ++- arch/x86/kernel/cpu/intel_64.c | 15 ++++++- arch/x86/kernel/setup_64.c | 96 +++++++++++++++++++++------------------- 5 files changed, 95 insertions(+), 55 deletions(-) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 250bfe6..30b7557 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -7,8 +7,7 @@ #include <mach_apic.h> -extern int __cpuinit get_model_name(struct cpuinfo_x86 *c); -extern void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c); +#include "cpu.h" int force_mwait __cpuinitdata; @@ -109,7 +108,7 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -118,7 +117,7 @@ void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } -void __cpuinit init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -200,3 +199,13 @@ void __cpuinit init_amd(struct cpuinfo_x86 *c) set_memory_4k((unsigned long)__va(tseg), 1); } } + +static struct cpu_dev amd_cpu_dev __cpuinitdata = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, + .c_early_init = early_init_amd, + .c_init = init_amd, +}; + +cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); + diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index bac96d1..13526fd 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -4,13 +4,15 @@ #include <asm/cpufeature.h> #include <asm/processor.h> -void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +#include "cpu.h" + +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 0x6 && c->x86_model >= 0xf) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } -void __cpuinit init_centaur(struct cpuinfo_x86 *c) +static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -29,3 +31,13 @@ void __cpuinit init_centaur(struct cpuinfo_x86 *c) } set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } + +static struct cpu_dev centaur_cpu_dev __cpuinitdata = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, + .c_init = init_centaur, +}; + +cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); + diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 40ad189..4d894e8 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,4 +1,6 @@ -#ifdef CONFIG_X86_32 +#ifndef ARCH_X86_CPU_H + +#define ARCH_X86_CPU_H struct cpu_model_info { int vendor; @@ -38,4 +40,4 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -#endif /* CONFIG_X86_32 */ +#endif diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index b339121..fcb1cc9 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -5,7 +5,9 @@ #include <asm/topology.h> #include <asm/numa_64.h> -void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +#include "cpu.h" + +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) @@ -48,7 +50,7 @@ static void __cpuinit srat_detect_node(void) #endif } -void __cpuinit init_intel(struct cpuinfo_x86 *c) +static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -90,3 +92,12 @@ void __cpuinit init_intel(struct cpuinfo_x86 *c) srat_detect_node(); } + +static struct cpu_dev intel_cpu_dev __cpuinitdata = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, + .c_early_init = early_init_intel, + .c_init = init_intel, +}; +cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); + diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 25afdd8..0cebf95 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -80,6 +80,8 @@ #define ARCH_SETUP #endif +#include "cpu/cpu.h" + /* * Machine setup.. */ @@ -163,6 +165,7 @@ static struct resource bss_resource = { .flags = IORESOURCE_RAM, }; +static void __init early_cpu_init(void); static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); #ifdef CONFIG_PROC_VMCORE @@ -339,6 +342,7 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; + early_cpu_init(); early_identify_cpu(&boot_cpu_data); strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); @@ -524,6 +528,19 @@ void __init setup_arch(char **cmdline_p) check_enable_amd_mmconf_dmi(); } +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ + display_cacheinfo(c); +} + +static struct cpu_dev __cpuinitdata default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", +}; +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -625,24 +642,37 @@ out: static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; - - if (!strcmp(v, "AuthenticAMD")) - c->x86_vendor = X86_VENDOR_AMD; - else if (!strcmp(v, "GenuineIntel")) - c->x86_vendor = X86_VENDOR_INTEL; - else if (!strcmp(v, "CentaurHauls")) - c->x86_vendor = X86_VENDOR_CENTAUR; - else - c->x86_vendor = X86_VENDOR_UNKNOWN; + int i; + static int printed; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + this_cpu = cpu_devs[i]; + return; + } + } + } + if (!printed) { + printed++; + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + c->x86_vendor = X86_VENDOR_UNKNOWN; } -// FIXME: Needs to use cpu_vendor_dev_register -extern void __cpuinit early_init_amd(struct cpuinfo_x86 *c); -extern void __cpuinit init_amd(struct cpuinfo_x86 *c); -extern void __cpuinit early_init_intel(struct cpuinfo_x86 *c); -extern void __cpuinit init_intel(struct cpuinfo_x86 *c); -extern void __cpuinit early_init_centaur(struct cpuinfo_x86 *c); -extern void __cpuinit init_centaur(struct cpuinfo_x86 *c); +static void __init early_cpu_init(void) +{ + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start ; + cvdev < __x86cpuvendor_end ; + cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; +} /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu @@ -722,17 +752,9 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - early_init_amd(c); - break; - case X86_VENDOR_INTEL: - early_init_intel(c); - break; - case X86_VENDOR_CENTAUR: - early_init_centaur(c); - break; - } + if (c->x86_vendor != X86_VENDOR_UNKNOWN && + cpu_devs[c->x86_vendor]->c_early_init) + cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); } @@ -760,24 +782,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * At the end of this section, c->x86_capability better * indicate the features this CPU genuinely supports! */ - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - init_amd(c); - break; - - case X86_VENDOR_INTEL: - init_intel(c); - break; - - case X86_VENDOR_CENTAUR: - init_centaur(c); - break; - - case X86_VENDOR_UNKNOWN: - default: - display_cacheinfo(c); - break; - } + if (this_cpu->c_init) + this_cpu->c_init(c); detect_ht(c); -- cgit v1.1 From c49c412a47b5102516d3313d4eba38cb1e968721 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 19 Jun 2008 15:30:31 -0700 Subject: x86: make 64bit identify_cpu use cpu_dev v2 v2: fix early_panic on this config: http://redhat.com/~mingo/misc/config-Thu_Jun_19_14_22_37_CEST_2008.bad reason : struct cpu_vendor_dev size is 16, need to make table to be 16 byte alignment also print out the cpu supported... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Dave Jones <davej@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_64.c | 20 ++++++++++++++++++++ arch/x86/kernel/vmlinux_64.lds.S | 1 + 2 files changed, 21 insertions(+) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 0cebf95..b789ec5 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -664,6 +664,25 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } +static void __init early_cpu_support_print(void) +{ + int i,j; + struct cpu_dev *cpu_devx; + + printk("KERNEL supported cpus:\n"); + for (i = 0; i < X86_VENDOR_NUM; i++) { + cpu_devx = cpu_devs[i]; + if (!cpu_devx) + continue; + for (j = 0; j < 2; j++) { + if (!cpu_devx->c_ident[j]) + continue; + printk(" %s %s\n", cpu_devx->c_vendor, + cpu_devx->c_ident[j]); + } + } +} + static void __init early_cpu_init(void) { struct cpu_vendor_dev *cvdev; @@ -672,6 +691,7 @@ static void __init early_cpu_init(void) cvdev < __x86cpuvendor_end ; cvdev++) cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + early_cpu_support_print(); } /* Do some early cpuid on the boot CPU to get some parameter that are diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674..b29f63b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -177,6 +177,7 @@ SECTIONS *(.con_initcall.init) } __con_initcall_end = .; + . = ALIGN(16); __x86cpuvendor_start = .; .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { *(.x86cpuvendor.init) -- cgit v1.1 From ce0c0e50f94e8c55b00a722e8c6e8d6c802be211 Mon Sep 17 00:00:00 2001 From: Andi Kleen <andi@firstfloor.org> Date: Fri, 2 May 2008 11:46:49 +0200 Subject: x86, generic: CPA add statistics about state of direct mapping v4 Add information about the mapping state of the direct mapping to /proc/meminfo. I chose /proc/meminfo because that is where all the other memory statistics are too and it is a generally useful metric even outside debugging situations. A lot of split kernel pages means the kernel will run slower. This way we can see how many large pages are really used for it and how many are split. Useful for general insight into the kernel. v2: Add hotplug locking to 64bit to plug a very obscure theoretical race. 32bit doesn't need it because it doesn't support hotadd for lowmem. Fix some typos v3: Rename dpages_cnt Add CONFIG ifdef for count update as requested by tglx Expand description v4: Fix stupid bugs added in v3 Move update_page_count to pageattr.c Signed-off-by: Andi Kleen <andi@firstfloor.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 5 +++++ arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pageattr.c | 35 +++++++++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 7 +++++++ include/asm-x86/pgtable.h | 3 +++ 5 files changed, 57 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10..0269ac2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -162,6 +162,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte; + unsigned pages_2m = 0, pages_4k = 0; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -197,6 +198,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) is_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; + pages_2m++; set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; @@ -213,11 +215,14 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) if (is_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; + pages_4k++; set_pte(pte, pfn_pte(pfn, prot)); } max_pfn_mapped = pfn; } } + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); } static inline int page_kills_ppro(unsigned long pagenr) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad9..5e43838 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -312,6 +312,8 @@ __meminit void early_iounmap(void *addr, unsigned long size) static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { + unsigned long pages = 0; + int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { @@ -328,9 +330,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; + pages++; set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } + update_page_count(PG_LEVEL_2M, pages); return address; } @@ -350,6 +354,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { + unsigned long pages = 0; unsigned long last_map_addr = end; int i = pud_index(addr); @@ -374,6 +379,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } if (direct_gbpages) { + pages++; set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; @@ -390,6 +396,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) unmap_low_page(pmd); } __flush_tlb_all(); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr >> PAGE_SHIFT; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b..668205b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,6 +34,19 @@ struct cpa_data { unsigned force_split : 1; }; +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void __meminit update_page_count(int level, unsigned long pages) +{ +#ifdef CONFIG_PROC_FS + unsigned long flags; + /* Protect against CPA */ + spin_lock_irqsave(&pgd_lock, flags); + direct_pages_count[level] += pages; + spin_unlock_irqrestore(&pgd_lock, flags); +#endif +} + #ifdef CONFIG_X86_64 static inline unsigned long highmap_start_pfn(void) @@ -500,6 +513,12 @@ static int split_large_page(pte_t *kpte, unsigned long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) { + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; + } + /* * Install the new, split up pagetable. Important details here: * @@ -1029,6 +1048,22 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifdef CONFIG_PROC_FS +int arch_report_meminfo(char *page) +{ + int n; + n = sprintf(page, "DirectMap4k: %8lu\n" + "DirectMap2M: %8lu\n", + direct_pages_count[PG_LEVEL_4K], + direct_pages_count[PG_LEVEL_2M]); +#ifdef CONFIG_X86_64 + n += sprintf(page + n, "DirectMap1G: %8lu\n", + direct_pages_count[PG_LEVEL_1G]); +#endif + return n; +} +#endif + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 7e277f2..15f7bd4 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +int __attribute__((weak)) arch_report_meminfo(char *page) +{ + return 0; +} + static int meminfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, len += hugetlb_report_meminfo(page + len); + len += arch_report_meminfo(page + len); + return proc_calc_metrics(page, start, off, count, eof, len); #undef K } diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 97c271b..111564f 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -369,8 +369,11 @@ enum { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_NUM }; +void update_page_count(int level, unsigned long pages); + /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. NULL means no pagetable entry present. -- cgit v1.1 From 65280e613fada41704f35709b6c8952ca4b8750c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 5 May 2008 16:35:21 +0200 Subject: x86: janitor CPA statistics patch 1) Remove __meminit from update_pages_count. It is used inside split_pages() 2) Make the code depend on PROC_FS. Doing statistics for nothing is useless and not adding useless code is nice to the Linux tiny folks. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pageattr.c | 48 +++++++++++++++++++++++++---------------------- include/asm-x86/pgtable.h | 6 +++++- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 668205b..0a3f5e0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,18 +34,40 @@ struct cpa_data { unsigned force_split : 1; }; +#ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; -void __meminit update_page_count(int level, unsigned long pages) +void update_page_count(int level, unsigned long pages) { -#ifdef CONFIG_PROC_FS unsigned long flags; + /* Protect against CPA */ spin_lock_irqsave(&pgd_lock, flags); direct_pages_count[level] += pages; spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void split_page_count(int level) +{ + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +int arch_report_meminfo(char *page) +{ + int n = sprintf(page, "DirectMap4k: %8lu\n" + "DirectMap2M: %8lu\n", + direct_pages_count[PG_LEVEL_4K], + direct_pages_count[PG_LEVEL_2M]); +#ifdef CONFIG_X86_64 + n += sprintf(page + n, "DirectMap1G: %8lu\n", + direct_pages_count[PG_LEVEL_1G]); #endif + return n; } +#else +static inline void split_page_count(int level) { } +#endif #ifdef CONFIG_X86_64 @@ -514,10 +536,8 @@ static int split_large_page(pte_t *kpte, unsigned long address) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) { - direct_pages_count[level]--; - direct_pages_count[level - 1] += PTRS_PER_PTE; - } + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); /* * Install the new, split up pagetable. Important details here: @@ -1048,22 +1068,6 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ -#ifdef CONFIG_PROC_FS -int arch_report_meminfo(char *page) -{ - int n; - n = sprintf(page, "DirectMap4k: %8lu\n" - "DirectMap2M: %8lu\n", - direct_pages_count[PG_LEVEL_4K], - direct_pages_count[PG_LEVEL_2M]); -#ifdef CONFIG_X86_64 - n += sprintf(page + n, "DirectMap1G: %8lu\n", - direct_pages_count[PG_LEVEL_1G]); -#endif - return n; -} -#endif - /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 111564f..bd26559 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -372,7 +372,11 @@ enum { PG_LEVEL_NUM }; -void update_page_count(int level, unsigned long pages); +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif /* * Helper function that returns the kernel pagetable entry controlling -- cgit v1.1 From 6e92a5a6151f4a467b8c1bde9123c0a9d1a63339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: add sparse annotations to ioremap arch/x86/mm/ioremap.c:308:11: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/ioremap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2b2bb3f..01d7642 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -318,8 +318,8 @@ void iounmap(volatile void __iomem *addr) * vm_area and by simply returning an address into the kernel mapping * of ISA space. So handle that here. */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) + if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) return; addr = (volatile void __iomem *) @@ -332,7 +332,7 @@ void iounmap(volatile void __iomem *addr) cpa takes care of the direct mappings. */ read_lock(&vmlist_lock); for (p = vmlist; p; p = p->next) { - if (p->addr == addr) + if (p->addr == (void __force *)addr) break; } read_unlock(&vmlist_lock); @@ -346,7 +346,7 @@ void iounmap(volatile void __iomem *addr) free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ - o = remove_vm_area((void *)addr); + o = remove_vm_area((void __force *)addr); BUG_ON(p != o || o == NULL); kfree(p); } @@ -365,7 +365,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void *)ioremap(start, PAGE_SIZE); + addr = (void __force *)ioremap(start, PAGE_SIZE); if (addr) addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); -- cgit v1.1 From 684eb0163a98bc329193b4aa4535cdd705a5dd58 Mon Sep 17 00:00:00 2001 From: Jiri Slaby <jirislaby@gmail.com> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86_64: use PAGE_OFFSET in dump_pagetables Use PAGE_OFFSET macro instead of using 0xffff810000000000UL directly. Signed-off-by: Jiri Slaby <jirislaby@gmail.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: hpa@zytor.com Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/mm/dump_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 2c24bea..0bb0cae 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = { { 0, "User Space" }, #ifdef CONFIG_X86_64 { 0x8000000000000000UL, "Kernel Space" }, - { 0xffff810000000000UL, "Low Kernel Mapping" }, + { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, { __START_KERNEL_map, "High Kernel Mapping" }, -- cgit v1.1 From a0176e2485ce6468f9b74264a2fd6c19811f027a Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 16 Jun 2008 12:44:17 +0200 Subject: Revert "Revert "x86: fix ioapic bug again"" This reverts commit 0b6a39f7ebcb1c82587ce35b401c513eed41ac5c. The changes in tip/x86/apic solve this better. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 12 ++++++++++-- arch/x86/kernel/nmi_32.c | 9 +++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 4dc8600d9..a40d54f 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2130,10 +2130,14 @@ static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; + unsigned int ver; unsigned long flags; local_irq_save(flags); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + /* * get/set the timer IRQ vector: */ @@ -2146,11 +2150,15 @@ static inline void __init check_timer(void) * mode for the 8259A whenever interrupts are routed * through I/O APICs. Also IRQ0 has to be enabled in * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. + * disabled in the local APIC. Finally timer interrupts + * need to be acknowledged manually in the 8259A for + * timer_interrupt() and for the i82489DX when using + * the NMI watchdog. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = 1; + timer_ack = !cpu_has_tsc; + timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); if (timer_over_8254 > 0) enable_8259A_irq(0); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 84160f7..11b14bb 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -26,6 +26,7 @@ #include <asm/smp.h> #include <asm/nmi.h> +#include <asm/timer.h> #include "mach_traps.h" @@ -81,7 +82,7 @@ int __init check_nmi_watchdog(void) prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) - return -1; + goto error; printk(KERN_INFO "Testing NMI watchdog ... "); @@ -118,7 +119,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) { kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - return -1; + goto error; } printk("OK.\n"); @@ -129,6 +130,10 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; +error: + timer_ack = !cpu_has_tsc; + + return -1; } static int __init setup_nmi_watchdog(char *str) -- cgit v1.1 From d11d5794e0c21a1054e6cd57381050a999ad7232 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:09:11 +0100 Subject: x86: I/O APIC: AEOI timer acknowledgement clean-ups The code that used to be in do_slow_gettimeoffset() that relied on the IRR bit of the master 8259A PIC for IRQ0 to check the state of the output timer 0 of the PIT is no longer there. As a result, there is no need to use the POLL command to acknowledge the timer interrupt in the "8259A Virtual Wire", except for the NMI watchdog when the i82489DX APIC is used (this is because this particular APIC treats NMIs as level-triggered and keeping the input asserted would keep motherboard NMI sources held off for too long). Remove the unneeded bits and adjust comments accordingly. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 20 +++++++++----------- arch/x86/kernel/io_apic_64.c | 7 ++----- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/time_32.c | 3 +-- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54f..c64b3f5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2146,19 +2146,17 @@ static inline void __init check_timer(void) set_intr_gate(vector, interrupt[0]); /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. Finally timer interrupts - * need to be acknowledged manually in the 8259A for - * timer_interrupt() and for the i82489DX when using - * the NMI watchdog. + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = !cpu_has_tsc; - timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); if (timer_over_8254 > 0) enable_8259A_irq(0); @@ -2219,6 +2217,7 @@ static inline void __init check_timer(void) printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = 0; } + timer_ack = 0; printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); @@ -2237,7 +2236,6 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - timer_ack = 0; init_8259A(0); make_8259A_irq(0); apic_write_around(APIC_LVT0, APIC_DM_EXTINT); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8df..7c34e38 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1669,11 +1669,8 @@ static inline void __init check_timer(void) assign_irq_vector(0, TARGET_CPUS); /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bb..d322901 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -131,7 +131,7 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; error: - timer_ack = !cpu_has_tsc; + timer_ack = 0; return -1; } diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 2ff21f3..5f29f12 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -84,8 +84,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) if (timer_ack) { /* * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to reset the IRR bit for do_slow_gettimeoffset(). - * This will also deassert NMI lines for the watchdog if run + * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ spin_lock(&i8259A_lock); -- cgit v1.1 From ecd29476ae0143b1c3641edfa76c0fc3e9ad3021 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:09:19 +0100 Subject: x86: I/O APIC: remove parameters to fiddle with the 8259A Remove the "disable_8254_timer" and "enable_8254_timer" kernel parameters. Now that AEOI acknowledgements are no longer needed for correct timer operation, the 8259A can be kept disabled unconditionally unless interrupts, either timer or watchdog ones, are actually passed through it. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/early-quirks.c | 13 ------------- arch/x86/kernel/io_apic_32.c | 21 ++------------------- arch/x86/kernel/io_apic_64.c | 21 ++------------------- include/asm-x86/apic.h | 1 - 4 files changed, 4 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9f51e1e..84fd9f2 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -98,17 +98,6 @@ static void __init nvidia_bugs(int num, int slot, int func) } -static void __init ati_bugs(int num, int slot, int func) -{ -#ifdef CONFIG_X86_IO_APIC - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } -#endif -} - #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -126,8 +115,6 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, { PCI_VENDOR_ID_VIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, - { PCI_VENDOR_ID_ATI, PCI_ANY_ID, - PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, {} diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c64b3f5..61cf366 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -58,7 +58,6 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); -int timer_over_8254 __initdata = 1; /* * Is the SiS APIC rmw bug present ? @@ -2157,8 +2156,6 @@ static inline void __init check_timer(void) apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - if (timer_over_8254 > 0) - enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2175,7 +2172,6 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); setup_nmi(); enable_8259A_irq(0); } @@ -2195,6 +2191,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + enable_8259A_irq(0); if (timer_irq_works()) { printk("works.\n"); if (pin1 != -1) @@ -2209,6 +2206,7 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ + disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); } printk(" failed.\n"); @@ -2221,7 +2219,6 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - disable_8259A_irq(0); set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, "fasteoi"); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ @@ -2292,20 +2289,6 @@ void __init setup_IO_APIC(void) print_IO_APIC(); } -static int __init setup_disable_8254_timer(char *s) -{ - timer_over_8254 = -1; - return 1; -} -static int __init setup_enable_8254_timer(char *s) -{ - timer_over_8254 = 2; - return 1; -} - -__setup("disable_8254_timer", setup_disable_8254_timer); -__setup("enable_8254_timer", setup_enable_8254_timer); - /* * Called after all the initialization is done. If we didnt find any * APIC bugs then we can allow the modify fast path diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 7c34e38..9f16ca4 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -90,7 +90,6 @@ static int no_timer_check; static int disable_timer_pin_1 __initdata; -int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; @@ -430,20 +429,6 @@ static int __init disable_timer_pin_setup(char *arg) } __setup("disable_timer_pin_1", disable_timer_pin_setup); -static int __init setup_disable_8254_timer(char *s) -{ - timer_over_8254 = -1; - return 1; -} -static int __init setup_enable_8254_timer(char *s) -{ - timer_over_8254 = 2; - return 1; -} - -__setup("disable_8254_timer", setup_disable_8254_timer); -__setup("enable_8254_timer", setup_enable_8254_timer); - /* * Find the IRQ entry number of a certain pin. @@ -1674,8 +1659,6 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - if (timer_over_8254 > 0) - enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -1693,7 +1676,6 @@ static inline void __init check_timer(void) if (!no_timer_check && timer_irq_works()) { nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); setup_nmi(); enable_8259A_irq(0); } @@ -1715,6 +1697,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); + enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); nmi_watchdog_default(); @@ -1726,6 +1709,7 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ + disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); } apic_printk(APIC_VERBOSE," failed.\n"); @@ -1737,7 +1721,6 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - disable_8259A_irq(0); irq_desc[0].chip = &lapic_irq_type; apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index be9639a..9c2dfd9 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -36,7 +36,6 @@ extern void generic_apic_probe(void); #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; -extern int timer_over_8254; extern int local_apic_timer_c2_ok; extern int local_apic_timer_disabled; -- cgit v1.1 From e67465f1298671266a8e824c1751afdb7c08c860 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:09:26 +0100 Subject: x86: I/O APIC: clean up after a fasteoi failure Disable the 8259A when routing of the timer interrupt through the chip to the local APIC of the primary processor has failed. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 1 + arch/x86/kernel/io_apic_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 61cf366..e7b7655 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2228,6 +2228,7 @@ static inline void __init check_timer(void) printk(" works.\n"); goto out; } + disable_8259A_irq(0); apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9f16ca4..6433fc9 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1729,6 +1729,7 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE," works.\n"); goto out; } + disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_VERBOSE," failed.\n"); -- cgit v1.1 From 60134ebe795b728dbb960485a8e873c3250ada36 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:09:34 +0100 Subject: x86: I/O APIC: keep IRQ off when changing LVT registers Disable the 8259A acting in the "virtual wire" mode to keep the interrupt line inactive while fiddling with local APIC interrupt vector registers associated with its destination inputs. To be on the safe side, especially concerning flipping the trigger mode. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 2 ++ arch/x86/kernel/io_apic_64.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index e7b7655..41218ac 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2199,7 +2199,9 @@ static inline void __init check_timer(void) else add_pin_to_irq(0, apic2, pin2); if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); setup_nmi(); + enable_8259A_irq(0); } goto out; } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6433fc9..aa45a85 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1702,7 +1702,9 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE," works.\n"); nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); setup_nmi(); + enable_8259A_irq(0); } goto out; } -- cgit v1.1 From 73d08e636026bbcb413d4864ca5e917502f8a0f9 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:09:43 +0100 Subject: x86: APIC/SMP: correct the message for "nosmp" The local APIC is no longer forced off when "nosmp" has been specified. Correct the message printed. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e1cece..8c53d86 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1157,8 +1157,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated," - "forcing use of dummy APIC emulation.\n"); + printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); #ifdef CONFIG_X86_32 connect_bsp_APIC(); -- cgit v1.1 From a1133d8e4ffc2db751eb987a2f3cf8ead67927c3 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:10:16 +0100 Subject: x86: APIC/SMP: downgrade the NMI watchdog for "nosmp" If configured to use the I/O APIC, the NMI watchdog is deemed to fail if the chip has been deactivated as a result of "nosmp". Downgrade to the local APIC watchdog similarly to what is done for the UP case. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c53d86..df60bc7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1159,6 +1159,10 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); + + if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) + nmi_watchdog = NMI_LOCAL_APIC; + #ifdef CONFIG_X86_32 connect_bsp_APIC(); #endif -- cgit v1.1 From 35542c5ebced864776d90d83d1e255016fd4c084 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Wed, 21 May 2008 22:10:22 +0100 Subject: x86: I/O APIC: clean up the 8259A on a NMI watchdog failure There is no point in keeping the 8259A enabled if the I/O APIC NMI watchdog has failed and the 8259A is not used to pass through regular timer interrupts. This fixes problems with some systems where some logic gets confused. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 2 ++ arch/x86/kernel/io_apic_64.c | 2 ++ arch/x86/kernel/nmi_32.c | 4 ++++ arch/x86/kernel/nmi_64.c | 11 +++++++++-- include/asm-x86/io_apic.h | 4 ++++ 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 41218ac..7c7d88e 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -58,6 +58,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); +int timer_through_8259 __initdata; /* * Is the SiS APIC rmw bug present ? @@ -2194,6 +2195,7 @@ static inline void __init check_timer(void) enable_8259A_irq(0); if (timer_irq_works()) { printk("works.\n"); + timer_through_8259 = 1; if (pin1 != -1) replace_pin_at_irq(0, apic1, pin1, apic2, pin2); else diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index aa45a85..2a60bfb 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -90,6 +90,7 @@ static int no_timer_check; static int disable_timer_pin_1 __initdata; +int timer_through_8259 __initdata; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; @@ -1700,6 +1701,7 @@ static inline void __init check_timer(void) enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); + timer_through_8259 = 1; nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index d322901..6580dae 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -24,6 +24,8 @@ #include <linux/kdebug.h> #include <linux/slab.h> +#include <asm/i8259.h> +#include <asm/io_apic.h> #include <asm/smp.h> #include <asm/nmi.h> #include <asm/timer.h> @@ -131,6 +133,8 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; error: + if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) + disable_8259A_irq(0); timer_ack = 0; return -1; diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded..0060e44 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -21,6 +21,8 @@ #include <linux/cpumask.h> #include <linux/kdebug.h> +#include <asm/i8259.h> +#include <asm/io_apic.h> #include <asm/smp.h> #include <asm/nmi.h> #include <asm/proto.h> @@ -90,7 +92,7 @@ int __init check_nmi_watchdog(void) prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) - return -1; + goto error; printk(KERN_INFO "Testing NMI watchdog ... "); @@ -121,7 +123,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) { kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - return -1; + goto error; } printk("OK.\n"); @@ -132,6 +134,11 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; +error: + if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) + disable_8259A_irq(0); + + return -1; } static int __init setup_nmi_watchdog(char *str) diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index d593e14..a673256 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -137,6 +137,9 @@ extern int sis_apic_bug; /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; +/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ +extern int timer_through_8259; + static inline void disable_ioapic_setup(void) { skip_ioapic_setup = 1; @@ -162,6 +165,7 @@ extern void ioapic_init_mappings(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 +static const int timer_through_8259 = 0; #endif #endif -- cgit v1.1 From 9a1c61929121cbd597a5575f82711c0db8ee1778 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:09 +0100 Subject: x86: I/O APIC: fix the name of the L-APIC IRQ handler The local APIC interrupt handler gets registered with set_irq_chip_and_handler_name(), which results in "local-APIC-edge-fasteoi" reported as the name of the handler. Fix by removing the type of the handler left over from before the generic handlers were introduced. The 64-bit variation should get fixed with the upcoming merge. NB It should really use the "edge" handler and not the "fasteoi" one, but that's a separate issue. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 7c7d88e..7064500 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2037,7 +2037,7 @@ static void unmask_lapic_irq (unsigned int irq) } static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC-edge", + .name = "local-APIC", .mask = mask_lapic_irq, .unmask = unmask_lapic_irq, .eoi = ack_apic, -- cgit v1.1 From f08252623c7981f5cea70e4fab4983a94fc52212 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:16 +0100 Subject: x86: I/O APIC: fix the name of the through-8259A handler When the through-8259A mode is used for the timer, the call to set_irq_handler() will register a NULL handler name, resulting in "IO-APIC-<NULL>" reported. Fix by calling ioapic_register_intr() as done for all the other I/O APIC interrupts. The 64-bit variation calls set_irq_chip_and_handler_name() here needlessly and should get fixed with the upcoming merge. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 7064500..c93c0d3 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1331,8 +1331,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].chip = &ioapic_chip; - set_irq_handler(0, handle_edge_irq); + ioapic_register_intr(0, vector, IOAPIC_EDGE); /* * Add it to the IO-APIC irq-routing table: -- cgit v1.1 From 80d16bace63e057d30337e48d70aef0881656457 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:22 +0100 Subject: x86: I/O APIC: remove redundant 8259A {,un}masking For a better control the masking and unmasking of the timer interrupt line in the 8259A operating in the 'Virtual Wire' mode has been moved out of setup_ExtINT_IRQ0_pin() now, so remove the redundant calls from the function. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 4 ---- arch/x86/kernel/io_apic_64.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c93c0d3..6e743ec 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1310,8 +1310,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in memset(&entry,0,sizeof(entry)); - disable_8259A_irq(0); - /* mask LVT0 */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); @@ -1337,8 +1335,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(apic, pin, entry); - - enable_8259A_irq(0); } void __init print_IO_APIC(void) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 2a60bfb..83d4e11 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -906,8 +906,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in memset(&entry, 0, sizeof(entry)); - disable_8259A_irq(0); - /* mask LVT0 */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); @@ -933,8 +931,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(apic, pin, entry); - - enable_8259A_irq(0); } void __apicdebuginit print_IO_APIC(void) -- cgit v1.1 From 6b4722a7779ebadcf016fd96ce3156b6acda8a31 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:28 +0100 Subject: x86: I/O APIC: remove redundant LVT0 masking The LINT0 line of the local APIC is masked in the LVT0 entry in check_timer() before this function is ever called. Removed the redundant unmasking for better control. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 3 --- arch/x86/kernel/io_apic_64.c | 3 --- 2 files changed, 6 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 6e743ec..51e5519 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1310,9 +1310,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in memset(&entry,0,sizeof(entry)); - /* mask LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - /* * We use logical delivery to get the timer IRQ * to the first CPU. diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 83d4e11..565f19b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -906,9 +906,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in memset(&entry, 0, sizeof(entry)); - /* mask LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - /* * We use logical delivery to get the timer IRQ * to the first CPU. -- cgit v1.1 From f7633ce55b2ea2926a39d7ca9d0bb06c43edd2c2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:34 +0100 Subject: x86: I/O APIC: rename setup_ExtINT_IRQ0_pin() Rename setup_ExtINT_IRQ0_pin() to setup_timer_IRQ0_pin() to better reflect the upcoming role of a function setting up a (semi-)arbitrary I/O APIC pin appropriately for the 8254 timer. By "appropriate" the following settings are meant: edge-triggered, active-high, all the other settings per-architecture. Adjust comments to reflect code appropriately. No functional changes. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 9 +++++---- arch/x86/kernel/io_apic_64.c | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 51e5519..ce682e8 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1302,9 +1302,10 @@ static void __init setup_IO_APIC_irqs(void) } /* - * Set up the 8259A-master output pin: + * Set up the timer pin, possibly with the 8259A-master behind. */ -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, + int vector) { struct IO_APIC_route_entry entry; @@ -1324,7 +1325,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in /* * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... + * scene we may have a 8259A-master in AEOI mode ... */ ioapic_register_intr(0, vector, IOAPIC_EDGE); @@ -2183,7 +2184,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + setup_timer_IRQ0_pin(apic2, pin2, vector); enable_8259A_irq(0); if (timer_irq_works()) { printk("works.\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 565f19b..6c4635f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -897,10 +897,10 @@ static void __init setup_IO_APIC_irqs(void) } /* - * Set up the 8259A-master output pin as broadcast to all - * CPUs. + * Set up the timer pin, possibly with the 8259A-master behind. */ -static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, + int vector) { struct IO_APIC_route_entry entry; @@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in /* * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... + * scene we may have a 8259A-master in AEOI mode ... */ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); @@ -1690,7 +1690,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); -- cgit v1.1 From 24742ece8eb01b5855059020ba1c09173fd9b732 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:40 +0100 Subject: x86: I/O APIC: unmask the second-chance timer interrupt Unmask the timer interrupt line set up in the through-8259A mode explicitly after setup_timer_IRQ0_pin() has set up the I/O APIC interrupt redirection entry to let the two operations be unbound from each other. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 1 + arch/x86/kernel/io_apic_64.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ce682e8..b381f93 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2185,6 +2185,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ setup_timer_IRQ0_pin(apic2, pin2, vector); + unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { printk("works.\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6c4635f..0e20b7d 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1691,6 +1691,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); -- cgit v1.1 From 03be750559b2fe20d85dd968e08d5fe1c3accf83 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:45 +0100 Subject: x86: I/O APIC: keep the timer IRQ masked during set-up Keep the timer interrupt line masked when reconfiguring its interrupt redirection entry in the I/O APIC. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index b381f93..63a2f64 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1316,7 +1316,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * to the first CPU. */ entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ + entry.mask = 1; /* mask IRQ now */ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 0e20b7d..ae0ac99 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -911,7 +911,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * to the first CPU. */ entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ + entry.mask = 1; /* mask IRQ now */ entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; -- cgit v1.1 From 691874fa96d6349a8b60f8ea9c2bae52ece79941 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 27 May 2008 21:19:51 +0100 Subject: x86: I/O APIC: timer through 8259A second-chance Some systems incorrectly report the ExtINTA pin of the I/O APIC as the genuine target of the timer interrupt. Here is a change that copies timer pin information found to the other pin if one has been found only. This way both a direct and a through-8259A route is tested with the pin letting these problematic systems work well enough. If no timer pin information has been found for the I/O APIC, then local APIC variations are tried only, similarly to what is done without the change (except without the misleading messages). Obviously if we try the first-chance path without being told by the BIOS to do so, we should not complain either, so do not print the message in this case. The 64-bit variation should be updated with a call to replace_pin_at_irq() which can be done with the upcoming merge. Since add_pin_to_irq() is now always called in the first-chance path, the condition to require it in the second-chance path no longer happens. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 38 ++++++++++++++++++++++++++++---------- arch/x86/kernel/io_apic_64.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 63a2f64..a69a59d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2122,6 +2122,7 @@ static inline void __init unlock_ExtINT_logic(void) static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; + int no_pin1 = 0; int vector; unsigned int ver; unsigned long flags; @@ -2159,10 +2160,30 @@ static inline void __init check_timer(void) printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? */ + if (no_pin1) { + add_pin_to_irq(0, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, vector); + } unmask_IO_APIC_irq(0); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { @@ -2174,26 +2195,23 @@ static inline void __init check_timer(void) goto out; } clear_IO_APIC_pin(apic1, pin1); - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " - "IO-APIC\n"); - } + if (!no_pin1) + printk(KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); - if (pin2 != -1) { + printk(KERN_INFO "...trying to set up timer (IRQ0) " + "through the 8259A ... "); printk("\n..... (found pin %d) ...", pin2); /* * legacy devices should be connected to IO APIC #0 */ + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, vector); unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { printk("works.\n"); timer_through_8259 = 1; - if (pin1 != -1) - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - else - add_pin_to_irq(0, apic2, pin2); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -2206,8 +2224,8 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); + printk(" failed.\n"); } - printk(" failed.\n"); if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ae0ac99..9a54b77 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1638,6 +1638,7 @@ static inline void __init check_timer(void) struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; unsigned long flags; + int no_pin1 = 0; local_irq_save(flags); @@ -1662,10 +1663,30 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", cfg->vector, apic1, pin1, apic2, pin2); + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? */ + if (no_pin1) { + add_pin_to_irq(0, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, vector); + } unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { nmi_watchdog_default(); @@ -1678,18 +1699,19 @@ static inline void __init check_timer(void) goto out; } clear_IO_APIC_pin(apic1, pin1); - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " - "connected to IO-APIC\n"); - } + if (!no_pin1) + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - if (pin2 != -1) { + apic_printk(APIC_VERBOSE,KERN_INFO + "...trying to set up timer (IRQ0) " + "through the 8259A ... "); apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ + /* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); enable_8259A_irq(0); @@ -1709,8 +1731,8 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_VERBOSE," failed.\n"); } - apic_printk(APIC_VERBOSE," failed.\n"); if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); -- cgit v1.1 From 071565095897dff9668cd4a2343160ae31538c27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:37 +0200 Subject: x86: move pci_routirq declaration to pci.h Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/pci/acpi.c | 1 - include/asm-x86/pci.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f..464279d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -218,7 +218,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do return bus; } -extern int pci_routeirq; static int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index 30bbde0..2db14cf 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -18,6 +18,8 @@ struct pci_sysdata { #endif }; +extern int pci_routeirq; + /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node); -- cgit v1.1 From 7223daf5e1a6298e459db84442adc04d68a6a42d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: make irq_cfg static Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9a54b77..bb3033f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -61,7 +61,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { +static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, -- cgit v1.1 From 431ee79db0a9552314d787446044973a8176b57d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:35 +0200 Subject: x86: apic_64.c fix sparse warnings about shadowed variables Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd..07fda23 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -875,7 +875,7 @@ static int __init detect_init_APIC(void) void __init early_init_lapic_mapping(void) { - unsigned long apic_phys; + unsigned long phys_addr; /* * If no local APIC can be found then go out @@ -884,11 +884,11 @@ void __init early_init_lapic_mapping(void) if (!smp_found_config) return; - apic_phys = mp_lapic_addr; + phys_addr = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); + APIC_BASE, phys_addr); /* * Fetch the APIC ID of the BSP in case we have a -- cgit v1.1 From b1b57ee135ac7614184faa7d7345b5e7098cb81d Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sat, 31 May 2008 12:20:10 +0200 Subject: x86 build fix: arch/x86/kernel/io_apic_64.c: In function 'check_timer': arch/x86/kernel/io_apic_64.c:1688: error: 'vector' undeclared (first use in this function) arch/x86/kernel/io_apic_64.c:1688: error: (Each undeclared identifier is reported only once arch/x86/kernel/io_apic_64.c:1688: error: for each function it appears in.) --- arch/x86/kernel/io_apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index bb3033f..8991567 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1685,7 +1685,7 @@ static inline void __init check_timer(void) */ if (no_pin1) { add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { -- cgit v1.1 From 067fa0ff0c89d25c2136ed095c72213089d4bb4e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Thu, 29 May 2008 22:32:30 +0400 Subject: x86: IO-APIC - use NMI_NONE instead of numeric constant Not sure but maybe it is better to use NMI_DISABLED, will take a look. But for now this patch is not change anything in logic so it will not hurt/broke the kernel. For most cases nmi_watchdog assignment is by one of NMI_* macro so I think there it make sense too. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a69a59d..5c0f8d6 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2229,7 +2229,7 @@ static inline void __init check_timer(void) if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; + nmi_watchdog = NMI_NONE; } timer_ack = 0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8991567..40a184d 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1736,7 +1736,7 @@ static inline void __init check_timer(void) if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; + nmi_watchdog = NMI_NONE; } apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); -- cgit v1.1 From ff11571b25152edfb1eb0e6feb7e0009670fe4a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 5 Jun 2008 11:17:16 +0200 Subject: x86, io-apic: fix nmi_watchdog=1 bootup hang nmi_watchdog=1 hangs on 64-bit: [ 0.250000] Detected 12.564 MHz APIC timer. [ 0.254178] APIC timer registered as dummy, due to nmi_watchdog=1! [ 0.260366] Testing NMI watchdog ... <4>WARNING: CPU#0: NMI appears to be stuck (0->0)! [ ... ] [ 0.470003] calling genl_init+0x0/0xd0 [ hard hang ] bisected it down to: git-bisect start git-bisect good 1beee8dc8cf58e3f605bd7b34d7a39939be7d8d2 git-bisect bad 11582ece0aaa2d0f94f345c08a4ab9997078a083 git-bisect bad 5479c623bb44089844022c03d4c0eb16d5b7a15f git-bisect bad cfb4c7fabeb499e1c29f9d1878968e37a938e28a git-bisect good 246dd412d31e4f5de1d43aa6422a325b785f36e4 git-bisect bad 3f8237eaff7dc1e35fa791dae095574fd974e671 git-bisect good 90e23b13ab849e2a11f00c655eb3a2011b4623be git-bisect bad 833526a34eeefc117df3191a594c3c3a4f15a9ac git-bisect good 791b93d3dfaf16c23e978bec0cc0a3dd9d855d63 git-bisect bad 65767c64068f2c93e56a1accfed5c78230ac12d7 git-bisect bad 2abc5c05dd82c188e3bdf6641a274f013348d14b git-bisect bad 317e1f2597ffb4d4db940577bbe56dc6e881ef07 | 317e1f2597ffb4d4db940577bbe56dc6e881ef07 is first bad commit | commit 317e1f2597ffb4d4db940577bbe56dc6e881ef07 | Author: Maciej W. Rozycki <macro@linux-mips.org> | Date: Wed May 21 22:10:22 2008 +0100 | x86: I/O APIC: clean up the 8259A on a NMI watchdog failure the problem is that in the dummy-lapic branch we rely on the i8259A but if the NMI watchdog fails we turn off IRQ 0 - which doesnt work too well ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 6 ++++-- arch/x86/kernel/apic_64.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..3cebf91 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -541,11 +541,13 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) + if (nmi_watchdog != NMI_IO_APIC) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else + } else { printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=1!\n"); + timer_through_8259 = 1; + } } /* Setup the lapic or request the broadcast */ diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 07fda23..6cbc668 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -413,11 +413,13 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) + if (nmi_watchdog != NMI_IO_APIC) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else + } else { printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=1!\n"); + timer_through_8259 = 1; + } setup_APIC_timer(); } -- cgit v1.1 From ab5a5be099cb20a1c006bf0e211c48502d7ebc44 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sun, 8 Jun 2008 10:13:33 +0200 Subject: Revert "x86, io-apic: fix nmi_watchdog=1 bootup hang" This reverts commit 2229ff84f01746d02fb6b79e156fb5cce48c908f. A better fix from Maciej will be merged. --- arch/x86/kernel/apic_32.c | 6 ++---- arch/x86/kernel/apic_64.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3cebf91..4b99b1b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -541,13 +541,11 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) { + if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - } else { + else printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=1!\n"); - timer_through_8259 = 1; - } } /* Setup the lapic or request the broadcast */ diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 6cbc668..07fda23 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -413,13 +413,11 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) { + if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - } else { + else printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=1!\n"); - timer_through_8259 = 1; - } setup_APIC_timer(); } -- cgit v1.1 From 6fe9fe875691e15eda61b992e03257e68aa5ba4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Sun, 8 Jun 2008 10:14:40 +0200 Subject: Revert "x86: APIC/SMP: downgrade the NMI watchdog for "nosmp"" This reverts commit 791b93d3dfaf16c23e978bec0cc0a3dd9d855d63. A better fix from Maciej will be merged. --- arch/x86/kernel/smpboot.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index df60bc7..8c53d86 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1159,10 +1159,6 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); - - if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) - nmi_watchdog = NMI_LOCAL_APIC; - #ifdef CONFIG_X86_32 connect_bsp_APIC(); #endif -- cgit v1.1 From 148b50830993acc67129f09c544d9167291e5458 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 6 Jun 2008 03:27:41 +0100 Subject: x86: NMI watchdog: Downgrade helper A downgrade helper for the NMI watchdog to be used in all places where the I/O APIC watchdog may have been requested, but the I/O APIC is found not to be there or meant to be left disabled. This is so that the reconfiguration is cosistent and defined in a single place only. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/nmi.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1e36302..745d8c8 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -78,6 +78,11 @@ extern int unknown_nmi_panic; void __trigger_all_cpu_backtrace(void); #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +static inline void localise_nmi_watchdog(void) +{ + if (nmi_watchdog == NMI_IO_APIC) + nmi_watchdog = NMI_LOCAL_APIC; +} #endif void lapic_watchdog_stop(void); -- cgit v1.1 From acae7d906f2f81d814e9c3730eeb19dfd3bf3bb4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 6 Jun 2008 03:27:49 +0100 Subject: x86: APIC/UP: Downgrade the NMI watchdog for no I/O APIC If configured to use the I/O APIC, the NMI watchdog is deemed to fail if the chip will not be used in the UP configuration, because "noapic" has been specified or the chip is simply not there. Downgrade to the local APIC watchdog to rectify. The new #ifdef is ugly, I know. A proper solution is to provide suitable definitions of smp_found_config, etc. for !CONFIG_X86_IO_APIC in a header. Likewise the whole if () condition should be moved to a static inline function. Such clean-ups are beyond the scope of this change and can be done once the whole issue of the timer has been sorted out. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 4 ++++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..26e7a62 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1269,6 +1269,10 @@ int __init APIC_init_uniprocessor(void) setup_local_APIC(); +#ifdef CONFIG_X86_IO_APIC + if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +#endif + localise_nmi_watchdog(); end_local_APIC_setup(); #ifdef CONFIG_X86_IO_APIC if (smp_found_config) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 07fda23..13eac4f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -954,6 +954,8 @@ int __init APIC_init_uniprocessor(void) if (!skip_ioapic_setup && nr_ioapics) enable_IO_APIC(); + if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) + localise_nmi_watchdog(); end_local_APIC_setup(); if (smp_found_config && !skip_ioapic_setup && nr_ioapics) -- cgit v1.1 From 19662027488946e34dd54d3bb408fa3307681d7d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 6 Jun 2008 03:27:56 +0100 Subject: x86: APIC/UP: Remove redundant NMI watchdog downgrade For the UP case the NMI watchdog downgrade is done consistently in APIC_init_uniprocessor() now. Remove redundant code used only when BIOS-disabled local APIC is activated. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 26e7a62..126e871 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1154,9 +1154,6 @@ static int __init detect_init_APIC(void) if (l & MSR_IA32_APICBASE_ENABLE) mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) - nmi_watchdog = NMI_LOCAL_APIC; - printk(KERN_INFO "Found and enabled local APIC!\n"); apic_pm_activate(); -- cgit v1.1 From d54db1ac9ecde9bcb8a561595b02c1d970d3a4d6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 6 Jun 2008 03:28:02 +0100 Subject: x86: APIC/SMP: Downgrade the NMI watchdog for "nosmp" If configured to use the I/O APIC, the NMI watchdog is deemed to fail if the chip has been deactivated as a result of "nosmp". Downgrade to the local APIC watchdog similarly to what is done for the UP case. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c53d86..f376034 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1159,6 +1159,9 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!max_cpus) { printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); + + localise_nmi_watchdog(); + #ifdef CONFIG_X86_32 connect_bsp_APIC(); #endif -- cgit v1.1 From d788bada2f6c49673f85338ac4c0c642e5e52cff Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 6 Jun 2008 03:28:13 +0100 Subject: x86: APIC/SMP: Downgrade the NMI watchdog for "noapic" If configured to use the I/O APIC, the NMI watchdog is deemed to fail if the chip has been deactivated as a result of "noapic". Downgrade to the local APIC watchdog similarly to what is done for the UP case. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mach-default/smpboot_hooks.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/mach-default/smpboot_hooks.h b/include/asm-x86/mach-default/smpboot_hooks.h index 56d0e1f..b63c521 100644 --- a/include/asm-x86/mach-default/smpboot_hooks.h +++ b/include/asm-x86/mach-default/smpboot_hooks.h @@ -41,8 +41,10 @@ static inline void __init smpboot_setup_io_apic(void) */ if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); - else + else { nr_ioapics = 0; + localise_nmi_watchdog(); + } } static inline void smpboot_clear_io_apic(void) -- cgit v1.1 From d3f020d2f9bb9a61ca64d4eb058c9f68f827a2b4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 7 Jun 2008 19:53:56 +0400 Subject: x86, io-apic: define names for redirection table entry fields Each I/O APIC redirection table entry has a number of fields. Define names for them to eliminate reference by hard coded numbers. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/io_apic.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index a673256..dc0f55f 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -11,6 +11,15 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ +/* I/O Unit Redirection Table */ +#define IO_APIC_REDIR_VECTOR_MASK 0x000FF +#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 +#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 +#define IO_APIC_REDIR_SEND_PENDING (1 << 12) +#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) +#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) +#define IO_APIC_REDIR_MASKED (1 << 16) + /* * The structure of the IO-APIC: */ -- cgit v1.1 From 46b3b4ef1ea2a0892b9b38b6a0c65a3f33b504aa Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Sat, 7 Jun 2008 19:53:57 +0400 Subject: x86, io-apic: use predefined names instead of numeric constants This patch replaces some hard-coded numbers with predefined names. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 10 ++++++---- arch/x86/kernel/io_apic_64.c | 13 +++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5c0f8d6..40fbb22 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -261,25 +261,27 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign /* mask = 1 */ static void __mask_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00010000, 0); + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); } /* mask = 0 */ static void __unmask_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0, 0x00010000); + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); } /* mask = 1, trigger = 0 */ static void __mask_and_edge_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER); } /* mask = 0, trigger = 1 */ static void __unmask_and_level_IO_APIC_irq (unsigned int irq) { - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED); } static void mask_IO_APIC_irq (unsigned int irq) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 40a184d..60a022d 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -183,7 +183,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) break; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if ((reg >> 14) & 1) { + if (reg & IO_APIC_REDIR_REMOTE_IRR) { spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -298,7 +298,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) break; io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~0x000000ff; + reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, reg); if (!entry->next) @@ -366,10 +366,11 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + +/* mask = 0 */ +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) static void mask_IO_APIC_irq (unsigned int irq) { -- cgit v1.1 From 360624484c81d55f88b1e5f48ce24c9243ce38e5 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Date: Sun, 8 Jun 2008 13:07:18 +0200 Subject: x86: coding style fixes to arch/x86/kernel/io_apic_32.c Before: total: 91 errors, 73 warnings, 2850 lines checked After: total: 1 errors, 47 warnings, 2848 lines checked Compile tested: paolo@paolo-desktop:/tmp$ size io* text data bss dec hex filename 13836 1756 11104 26696 6848 io_apic_32.o.after 13836 1756 11104 26696 6848 io_apic_32.o.before Signed-off-by: Paolo Ciarrocchi <paolo.ciarrocchi@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 348 +++++++++++++++++++++---------------------- 1 file changed, 173 insertions(+), 175 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 40fbb22..9a924eb 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -239,7 +239,7 @@ static void __init replace_pin_at_irq(unsigned int irq, } } -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; unsigned int pin, reg; @@ -259,32 +259,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign } /* mask = 1 */ -static void __mask_IO_APIC_irq (unsigned int irq) +static void __mask_IO_APIC_irq(unsigned int irq) { __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); } /* mask = 0 */ -static void __unmask_IO_APIC_irq (unsigned int irq) +static void __unmask_IO_APIC_irq(unsigned int irq) { __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); } /* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) { __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER); } /* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) { __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED); } -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq(unsigned int irq) { unsigned long flags; @@ -293,7 +293,7 @@ static void mask_IO_APIC_irq (unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq(unsigned int irq) { unsigned long flags; @@ -305,7 +305,7 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - + /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) @@ -317,7 +317,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_mask_entry(apic, pin); } -static void clear_IO_APIC (void) +static void clear_IO_APIC(void) { int apic, pin; @@ -334,7 +334,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; cpumask_t tmp; - + cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) tmp = TARGET_CPUS; @@ -363,7 +363,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <linux/kernel_stat.h> /* kstat */ # include <linux/slab.h> /* kmalloc() */ # include <linux/timer.h> - + #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -375,14 +375,14 @@ static int physical_balance __read_mostly; static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; static struct irq_cpu_info { - unsigned long * last_irq; - unsigned long * irq_delta; + unsigned long *last_irq; + unsigned long *irq_delta; unsigned long irq; } irq_cpu_data[NR_CPUS]; #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) +#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) #define IDLE_ENOUGH(cpu,now) \ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) @@ -421,8 +421,8 @@ inside: if (cpu == -1) cpu = NR_CPUS-1; } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu,now))); + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu, now))); return cpu; } @@ -432,15 +432,14 @@ static inline void balance_irq(int cpu, int irq) unsigned long now = jiffies; cpumask_t allowed_mask; unsigned int new_cpu; - + if (irqbalance_disabled) - return; + return; cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) { + if (cpu != new_cpu) set_pending_irq(irq, cpumask_of_cpu(new_cpu)); - } } static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) @@ -452,14 +451,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) if (!irq_desc[j].action) continue; /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < useful_load_threshold) continue; balance_irq(i, j); } } balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); return; } @@ -488,22 +487,22 @@ static void do_irq_balance(void) /* Is this an active IRQ or balancing disabled ? */ if (!irq_desc[j].action || irq_balancing_disabled(j)) continue; - if ( package_index == i ) - IRQ_DELTA(package_index,j) = 0; + if (package_index == i) + IRQ_DELTA(package_index, j) = 0; /* Determine the total count per processor per IRQ */ value_now = (unsigned long) kstat_cpu(i).irqs[j]; /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i,j); + delta = value_now - LAST_CPU_IRQ(i, j); /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i,j) = value_now; + LAST_CPU_IRQ(i, j) = value_now; /* Ignore IRQs whose rate is less than the clock */ if (delta < useful_load_threshold) continue; /* update the load for the processor or package total */ - IRQ_DELTA(package_index,j) += delta; + IRQ_DELTA(package_index, j) += delta; /* Keep track of the higher numbered sibling as well */ if (i != package_index) @@ -529,7 +528,8 @@ static void do_irq_balance(void) max_cpu_irq = ULONG_MAX; tryanothercpu: - /* Look for heaviest loaded processor. + /* + * Look for heaviest loaded processor. * We may come back to get the next heaviest loaded processor. * Skip processors with trivial loads. */ @@ -538,7 +538,7 @@ tryanothercpu: for_each_online_cpu(i) { if (i != CPU_TO_PACKAGEINDEX(i)) continue; - if (max_cpu_irq <= CPU_IRQ(i)) + if (max_cpu_irq <= CPU_IRQ(i)) continue; if (tmp_cpu_irq < CPU_IRQ(i)) { tmp_cpu_irq = CPU_IRQ(i); @@ -547,8 +547,9 @@ tryanothercpu: } if (tmp_loaded == -1) { - /* In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original + /* + * In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original * approach to rotate them around. */ if (!first_attempt && imbalance >= useful_load_threshold) { @@ -557,13 +558,14 @@ tryanothercpu: } goto not_worth_the_effort; } - + first_attempt = 0; /* heaviest search */ max_cpu_irq = tmp_cpu_irq; /* load */ max_loaded = tmp_loaded; /* processor */ imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* if imbalance is less than approx 10% of max load, then + + /* + * if imbalance is less than approx 10% of max load, then * observe diminishing returns action. - quit */ if (imbalance < (max_cpu_irq >> 3)) @@ -579,26 +581,25 @@ tryanotherirq: /* Is this an active IRQ? */ if (!irq_desc[j].action) continue; - if (imbalance <= IRQ_DELTA(max_loaded,j)) + if (imbalance <= IRQ_DELTA(max_loaded, j)) continue; /* Try to find the IRQ that is closest to the imbalance * without going over. */ - if (move_this_load < IRQ_DELTA(max_loaded,j)) { - move_this_load = IRQ_DELTA(max_loaded,j); + if (move_this_load < IRQ_DELTA(max_loaded, j)) { + move_this_load = IRQ_DELTA(max_loaded, j); selected_irq = j; } } - if (selected_irq == -1) { + if (selected_irq == -1) goto tryanothercpu; - } imbalance = move_this_load; - + /* For physical_balance case, we accumulated both load * values in the one of the siblings cpu_irq[], * to use the same code for physical and logical processors - * as much as possible. + * as much as possible. * * NOTE: the cpu_irq[] array holds the sum of the load for * sibling A and sibling B in the slot for the lowest numbered @@ -627,11 +628,11 @@ tryanotherirq: /* mark for change destination */ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - /* Since we made a change, come back sooner to + /* Since we made a change, come back sooner to * check for more variation. */ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); return; } goto tryanotherirq; @@ -642,7 +643,7 @@ not_worth_the_effort: * upward */ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); return; } @@ -681,13 +682,13 @@ static int __init balanced_irq_init(void) cpumask_t tmp; cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; + c = &boot_cpu_data; /* When not overwritten by the command line ask subarchitecture. */ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) irqbalance_disabled = NO_BALANCE_IRQ; if (irqbalance_disabled) return 0; - + /* disable irqbalance completely if there is only one processor online */ if (num_online_cpus() < 2) { irqbalance_disabled = 1; @@ -707,10 +708,10 @@ static int __init balanced_irq_init(void) printk(KERN_ERR "balanced_irq_init: out of memory"); goto failed; } - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].irq_delta, 0, sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq, 0, sizeof(unsigned long) * NR_IRQS); } - + printk(KERN_INFO "Starting balanced_irq\n"); if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) return 0; @@ -845,7 +846,7 @@ static int __init find_isa_irq_apic(int irq, int type) } if (i < mp_irq_entries) { int apic; - for(apic = 0; apic < nr_ioapics; apic++) { + for (apic = 0; apic < nr_ioapics; apic++) { if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) return apic; } @@ -882,7 +883,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + int irq = pin_2_irq(i, apic, mp_irqs[i].mpc_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; @@ -902,7 +903,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); /* - * This function currently is only a helper for the i386 smp boot process where + * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online * so mask in all cases should simply be TARGET_CPUS */ @@ -977,37 +978,36 @@ static int MPBIOS_polarity(int idx) /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mpc_irqflag & 3) + switch (mp_irqs[idx].mpc_irqflag & 3) { + case 0: /* conforms, ie. bus-type dependent polarity */ { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } + polarity = test_bit(bus, mp_bus_not_pci)? + default_ISA_polarity(idx): + default_PCI_polarity(idx); + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } } return polarity; } @@ -1020,69 +1020,67 @@ static int MPBIOS_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { + case 0: /* conforms, ie. bus-type dependent */ { - case 0: /* conforms, ie. bus-type dependent */ - { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); + trigger = test_bit(bus, mp_bus_not_pci)? + default_ISA_trigger(idx): + default_PCI_trigger(idx); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ break; } - case 1: /* edge */ + case MP_BUS_EISA: /* EISA pin */ { - trigger = 0; + trigger = default_EISA_trigger(idx); break; } - case 2: /* reserved */ + case MP_BUS_PCI: /* PCI pin */ { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; + /* set before the switch */ break; } - case 3: /* level */ + case MP_BUS_MCA: /* MCA pin */ { - trigger = 1; + trigger = default_MCA_trigger(idx); break; } - default: /* invalid */ + default: { printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; + trigger = 1; break; } } +#endif + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } return trigger; } @@ -1150,8 +1148,8 @@ static inline int IO_APIC_irq_trigger(int irq) for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic,pin,mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) return irq_trigger(idx); } } @@ -1166,7 +1164,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 } static int __assign_irq_vector(int irq) { - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + static int current_vector = FIRST_DEVICE_VECTOR, current_offset; int vector, offset; BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); @@ -1239,15 +1237,15 @@ static void __init setup_IO_APIC_irqs(void) /* * add it to the IO-APIC irq-routing table: */ - memset(&entry,0,sizeof(entry)); + memset(&entry, 0, sizeof(entry)); entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - idx = find_irq_entry(apic,pin,mp_INT); + idx = find_irq_entry(apic, pin, mp_INT); if (idx == -1) { if (first_notcon) { apic_printk(APIC_VERBOSE, KERN_DEBUG @@ -1291,7 +1289,7 @@ static void __init setup_IO_APIC_irqs(void) vector = assign_irq_vector(irq); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); - + if (!apic && (irq < 16)) disable_8259A_irq(irq); } @@ -1311,7 +1309,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; - memset(&entry,0,sizeof(entry)); + memset(&entry, 0, sizeof(entry)); /* * We use logical delivery to get the timer IRQ @@ -1349,7 +1347,7 @@ void __init print_IO_APIC(void) if (apic_verbosity == APIC_QUIET) return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); @@ -1454,7 +1452,7 @@ void __init print_IO_APIC(void) #if 0 -static void print_APIC_bitfield (int base) +static void print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1475,7 +1473,7 @@ static void print_APIC_bitfield (int base) } } -void /*__init*/ print_local_APIC(void * dummy) +void /*__init*/ print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; @@ -1558,7 +1556,7 @@ void /*__init*/ print_local_APIC(void * dummy) printk("\n"); } -void print_all_local_APICs (void) +void print_all_local_APICs(void) { on_each_cpu(print_local_APIC, NULL, 1, 1); } @@ -1581,11 +1579,11 @@ void /*__init*/ print_PIC(void) v = inb(0xa0) << 8 | inb(0x20); printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - outb(0x0b,0xa0); - outb(0x0b,0x20); + outb(0x0b, 0xa0); + outb(0x0b, 0x20); v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); + outb(0x0a, 0xa0); + outb(0x0a, 0x20); spin_unlock_irqrestore(&i8259A_lock, flags); @@ -1621,7 +1619,7 @@ static void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - for(apic = 0; apic < nr_ioapics; apic++) { + for (apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { @@ -1743,7 +1741,7 @@ static void __init setup_ioapic_ids_from_mpc(void) spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); - + old_id = mp_ioapics[apic].mpc_apicid; if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { @@ -1795,7 +1793,7 @@ static void __init setup_ioapic_ids_from_mpc(void) /* * Read the right value from the MPC table and * write it into the ID register. - */ + */ apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", mp_ioapics[apic].mpc_apicid); @@ -2015,7 +2013,7 @@ static void ack_apic(unsigned int irq) ack_APIC_irq(); } -static void mask_lapic_irq (unsigned int irq) +static void mask_lapic_irq(unsigned int irq) { unsigned long v; @@ -2023,7 +2021,7 @@ static void mask_lapic_irq (unsigned int irq) apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } -static void unmask_lapic_irq (unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; @@ -2041,14 +2039,14 @@ static struct irq_chip lapic_chip __read_mostly = { static void __init setup_nmi(void) { /* - * Dirty trick to enable the NMI watchdog ... + * Dirty trick to enable the NMI watchdog ... * We put the 8259A master into AEOI mode and * unmask on all local APICs LVT0 as NMI. * * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') * is from Maciej W. Rozycki - so we do not have to EOI from * the NMI handler or the timer interrupt. - */ + */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); enable_NMI_through_LVT0(); @@ -2312,10 +2310,10 @@ void __init setup_IO_APIC(void) * Called after all the initialization is done. If we didnt find any * APIC bugs then we can allow the modify fast path */ - + static int __init io_apic_bug_finalize(void) { - if(sis_apic_bug == -1) + if (sis_apic_bug == -1) sis_apic_bug = 0; return 0; } @@ -2326,17 +2324,17 @@ struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; }; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; +static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; int i; - + data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) entry[i] = ioapic_read_entry(dev->id, i); return 0; @@ -2349,7 +2347,7 @@ static int ioapic_resume(struct sys_device *dev) unsigned long flags; union IO_APIC_reg_00 reg_00; int i; - + data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; @@ -2360,7 +2358,7 @@ static int ioapic_resume(struct sys_device *dev) io_apic_write(dev->id, 0, reg_00.raw); } spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); return 0; @@ -2374,15 +2372,15 @@ static struct sysdev_class ioapic_sysdev_class = { static int __init ioapic_init_sysfs(void) { - struct sys_device * dev; + struct sys_device *dev; int i, size, error = 0; error = sysdev_class_register(&ioapic_sysdev_class); if (error) return error; - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] + for (i = 0; i < nr_ioapics; i++) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); if (!mp_ioapic_data[i]) { @@ -2391,7 +2389,7 @@ static int __init ioapic_init_sysfs(void) } memset(mp_ioapic_data[i], 0, size); dev = &mp_ioapic_data[i]->dev; - dev->id = i; + dev->id = i; dev->cls = &ioapic_sysdev_class; error = sysdev_register(dev); if (error) { @@ -2466,7 +2464,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms msg->address_lo = MSI_ADDR_BASE_LO | ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: +MSI_ADDR_DEST_MODE_PHYSICAL: MSI_ADDR_DEST_MODE_LOGICAL) | ((INT_DELIVERY_MODE != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU: @@ -2477,7 +2475,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: +MSI_DATA_DELIVERY_FIXED: MSI_DATA_DELIVERY_LOWPRI) | MSI_DATA_VECTOR(vector); } @@ -2648,12 +2646,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) #endif /* CONFIG_HT_IRQ */ /* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration + ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -int __init io_apic_get_unique_id (int ioapic, int apic_id) +int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -2662,10 +2660,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) int i = 0; /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only * supports up to 16 on one shared APIC bus. - * + * * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full * advantage of new APIC bus architecture. */ @@ -2684,7 +2682,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) } /* - * Every APIC in a system must have a unique ID or we get lots of nice + * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (check_apicid_used(apic_id_map, apic_id)) { @@ -2701,7 +2699,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) "trying %d\n", ioapic, apic_id, i); apic_id = i; - } + } tmp = apicid_to_cpu_present(apic_id); physids_or(apic_id_map, apic_id_map, tmp); @@ -2728,7 +2726,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id) } -int __init io_apic_get_version (int ioapic) +int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -2741,7 +2739,7 @@ int __init io_apic_get_version (int ioapic) } -int __init io_apic_get_redir_entries (int ioapic) +int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -2754,7 +2752,7 @@ int __init io_apic_get_redir_entries (int ioapic) } -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) { struct IO_APIC_route_entry entry; @@ -2770,7 +2768,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a * corresponding device driver registers for this IRQ. */ - memset(&entry,0,sizeof(entry)); + memset(&entry, 0, sizeof(entry)); entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; -- cgit v1.1 From cd08d0754ecb0cb9293c8476cb33ded1d23d0d8f Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Thu, 19 Jun 2008 00:39:33 +0100 Subject: x86: fix IO APIC breakage on HP nx6325 On Thu, 19 Jun 2008, Rafael J. Wysocki wrote: > > With such a configuration the "x86: I/O APIC: timer through 8259A > > second-chance" patch should not matter, because the only change it > > introduces is an attempt to try the same I/O APIC pin again, but with the > > IRQ0 line of the master 8259A enabled. That's not a terribly unusual > > configuration and nothing should get confused in the system. > > But it _does_ get confused, really. Something certainly gets confused, but so far I am not sure which bit exactly it is, are you? > > Barring the unlikely possibility of the 8259A actually being wired to > > INTIN2 of the I/O APIC I can see two possible explanations: > > > > 1. The 8259A interrupt actually escapes to the CPU somehow and is handled > > as an ExtINTA interrupt. This would make the code in check_timer() > > decide it has found a working configuration, while actually it has been > > fooled. [...] > Here you go: > > [ 0.108006] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 > [ 0.108006] ..MP-BIOS bug: 8254 timer not connected to IO-APIC > [ 0.108006] ...trying to set up timer (IRQ0) through the 8259A ... <3> > [ 0.108006] ..... (found apic 0 pin 2) ...<3> works. > > The full dmesg is at: http://www.sisk.pl/kernel/debug/20080618/dmesg-1.log Thanks. In this case I suspect the case #1 quoted above happens, that is the 8259A manages to deliver its interrupt somehow. Note at this stage it is meant to be in the AEOI mode, so it can happily resubmit the interrupt indefinitely with no additional handling as long as it receives INTA cycles. Can you please try the patch below on top of "x86: I/O APIC: timer through 8259A second-chance" to see whether my hypothesis is true? It modifies the through-8259A setup path so that the APIC input gets masked, but the 8259A has the timer interrupt still enabled. Let me know how the timer interrupt is routed in this case. Bisected-by: "Rafael J. Wysocki" <rjw@sisk.pl> Tested-by: "Rafael J. Wysocki" <rjw@sisk.pl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 60a022d..f06f5b4 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1715,6 +1715,7 @@ static inline void __init check_timer(void) /* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); + clear_IO_APIC_pin(apic2, pin2); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); -- cgit v1.1 From 7f0dbbc08d80df7ea15d8da4f0d78255891c8812 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Fri, 20 Jun 2008 01:35:07 +0100 Subject: x86: fix IO APIC breakage on HP nx6325, v2 > That helped a lot, the system seems to work normally now. > > Here's the relevant snippet from dmesg: > > [ 0.108006] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 > [ 0.108006] ..MP-BIOS bug: 8254 timer not connected to IO-APIC > [ 0.108006] ...trying to set up timer (IRQ0) through the 8259A ... <3> > [ 0.108006] ..... (found apic 0 pin 2) ...<3> failed. > [ 0.108006] ...trying to set up timer as Virtual Wire IRQ...<3> works. > > and the whole thing is at: http://www.sisk.pl/kernel/debug/20080618/dmesg-2.log Hmm, that only proved the 8259A is indeed wired to the pin #2 of the I/O APIC. > I, personally, don't have any and AMD only has SB600 documentation on its > web page (it's still marked as "AMD confidential" ;-)). Well, the IC block is most likely the same as that's not rocket science and once done there is no need to fiddle with that. That written, I am afraid there is nothing useful about the IC in the document, except that it's there and consists of an I/O APIC providing 24 inputs and the usual pair of 8259A cores. Thanks for the reference anyway. > There is an interrupt controller in there, but I'm not sure if there's any > 8259A. The northbridge is on the CPU, actually. I will praise the day someone ships an x86 machine without an 8259A core! As expressed in another mail I suspect there may actually be a direct route from the 8254 to INTIN0 in the southbridge -- this is what other bootstrap logs seen in the Internet suggest. This would mean this particular BIOS is buggy (is it the latest version?) and provides an incorrect IRQ override in its ACPI tables, for example because the responsible block has been blindly copied from a machine using a commoner wiring. This could be moderately easily fixed up with a quirk based on the PCI ID (after checking it again, we actually used to have a quirk for ATI in this area, but the way it was done suggests the issue was not understood well enough). Could you please remove the hack sent yesterday and test the patch provided below? I do hope it builds, but I have no immediate means to check it. Please report the output. The intent is to test INTIN0 directly before testing INTIN2 through the 8259A. Thanks. Aside of that, what I have gathered from your reports (please correct me if I have got it wrong) is that when the through-8259A mode is used, then after a while 8254 timer interrupts stop arriving. What's interesting, the "Virtual Wire IRQ" seems to work for you correctly (that's quite an odd setup where a local APIC input is used in the native mode -- please post /proc/interrupts for confirmation), which in turn implies the master 8259A drives its INT output as we expect. Why would the I/O APIC input have problems then? Hmm... [ mingo@elte.hu: revert the "x86: fix IO APIC breakage on HP nx6325" version. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_64.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index f06f5b4..40aa041 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -360,6 +360,26 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry->pin = pin; } +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + #define DO_ACTION(name,R,ACTION, FINAL) \ \ @@ -1680,6 +1700,11 @@ static inline void __init check_timer(void) apic2 = apic1; } + replace_pin_at_irq(0, 0, 0, apic1, pin1); + apic1 = 0; + pin1 = 0; + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? @@ -1712,10 +1737,9 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - /* replace_pin_at_irq(0, apic1, pin1, apic2, pin2); */ + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); - clear_IO_APIC_pin(apic2, pin2); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); -- cgit v1.1 From 25556c1699ad84dd6077adf67c92eba362aa7dc2 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet <jaillet.christophe@wanadoo.fr> Date: Sun, 22 Jun 2008 22:13:48 +0200 Subject: x86, arch/x86/kernel/io_apic_32.c: use kzalloc instead of kmalloc/memset 1) replace kmalloc/memset with equivalent kzalloc. Signed-off-by: Christophe Jaillet <jaillet.christophe@wanadoo.fr> Cc: cj <jaillet.christophe@wanadoo.fr> Cc: petero2@telia.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 9a924eb..e22cbfb 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -702,14 +702,12 @@ static int __init balanced_irq_init(void) physical_balance = 1; for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { printk(KERN_ERR "balanced_irq_init: out of memory"); goto failed; } - memset(irq_cpu_data[i].irq_delta, 0, sizeof(unsigned long) * NR_IRQS); - memset(irq_cpu_data[i].last_irq, 0, sizeof(unsigned long) * NR_IRQS); } printk(KERN_INFO "Starting balanced_irq\n"); @@ -2382,12 +2380,11 @@ static int __init ioapic_init_sysfs(void) for (i = 0; i < nr_ioapics; i++) { size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); if (!mp_ioapic_data[i]) { printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); continue; } - memset(mp_ioapic_data[i], 0, size); dev = &mp_ioapic_data[i]->dev; dev->id = i; dev->cls = &ioapic_sysdev_class; -- cgit v1.1 From 8b2ef1d7285740953a2c4ef7faf15fdfc5e2f358 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Sun, 8 Jun 2008 15:46:30 +0200 Subject: x86: add flags parameter to reserve_bootmem_generic() This patch adds a 'flags' parameter to reserve_bootmem_generic() like it already has been added in reserve_bootmem() with commit 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246. It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively change the behaviour. Since the change is x86-specific, I don't think it's necessary to add a new API for migration. There are only 4 users of that function. The change is necessary for the next patch, using reserve_bootmem_generic() for crashkernel reservation. Signed-off-by: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mpparse.c | 5 +++-- arch/x86/mm/init_64.c | 17 ++++++++++++----- include/asm-x86/proto.h | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 1cc7a4b..6ae6090 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -880,10 +880,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE); + PAGE_SIZE, BOOTMEM_DEFAULT); #endif return 1; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad9..bf7bf1d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) { #ifdef CONFIG_NUMA int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; + int ret; if (pfn >= end_pfn) { /* @@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * firmware tables: */ if (pfn < max_pfn_mapped) - return; + return -EFAULT; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); - return; + return -EFAULT; } /* Should check here against the e820 map to avoid double free */ @@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) nid = phys_to_nid(phys); next_nid = phys_to_nid(phys + len - 1); if (nid == next_nid) - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif @@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); } + + return 0; } int kern_addr_valid(unsigned long addr) diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index 6c8b41b..a9f5147 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,7 +14,7 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern void reserve_bootmem_generic(unsigned long phys, unsigned len); +extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); extern void syscall32_cpu_init(void); -- cgit v1.1 From 12448e3ef3ae7fca7a7ad205084f45a67dcc4356 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Sun, 8 Jun 2008 15:46:31 +0200 Subject: x86: use reserve_bootmem_generic() to reserve crashkernel memory on x86_64 This patch uses reserve_bootmem_generic() instead of reserve_bootmem() to reserve the crashkernel memory on x86_64. That's necessary for NUMA machines, see 00212fef814612245ed0261cbac8426d0c9a31a5: [PATCH] Fix kdump Crash Kernel boot memory reservation for NUMA machines This patch will fix a boot memory reservation bug that trashes memory on the ES7000 when loading the kdump crash kernel. The code in arch/x86_64/kernel/setup.c to reserve boot memory for the crash kernel uses the non-numa aware "reserve_bootmem" function instead of the NUMA aware "reserve_bootmem_generic". I checked to make sure that no other function was using "reserve_bootmem" and found none, except the ones that had NUMA ifdef'ed out. I have tested this patch only on an ES7000 with NUMA on and off (numa=off) in a single (non-NUMA) and multi-cell (NUMA) configurations. Signed-off-by: Amul Shah <amul.shah@unisys.com> Looks-good-to: Vivek Goyal <vgoyal@in.ibm.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> The switch-back to reserve_bootmem() was accidentally introduced in 5c3391f9f749023a49c64d607da4fb49263690eb when adding the BOOTMEM_EXCLUSIVE parameter. Signed-off-by: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 26d60cc..ec65696 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -244,7 +244,7 @@ static void __init reserve_crashkernel(void) return; } - if (reserve_bootmem(crash_base, crash_size, + if (reserve_bootmem_generic(crash_base, crash_size, BOOTMEM_EXCLUSIVE) < 0) { printk(KERN_INFO "crashkernel reservation failed - " "memory is in use\n"); -- cgit v1.1 From df5f6c212cc049d1989b5ce71bb863a367c261e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 10 Jun 2008 16:38:41 +0200 Subject: x86: unify the reserve_bootmem() behavior of early_res_to_bootmem() Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a706e90..68aba41 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -635,12 +635,8 @@ void __init early_res_to_bootmem(u64 start, u64 end) continue; printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, final_start, final_end - 1, r->name); -#ifdef CONFIG_X86_64 - reserve_bootmem_generic(final_start, final_end - final_start); -#else reserve_bootmem(final_start, final_end - final_start, BOOTMEM_DEFAULT); -#endif } } -- cgit v1.1 From ab4a465e96adf2f3a8aaa95384bacfa9ab661e35 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 10 Jun 2008 12:55:54 -0700 Subject: x86: e820 merge parsing of the mem=/memmap= boot parameters since we now have 32-bit support for e820_register_active_regions(), we can merge the parsing of the mem=/memmap= boot parameters. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 86 +++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 120 ---------------------------------------------- arch/x86/kernel/e820_64.c | 69 -------------------------- include/asm-x86/e820.h | 2 + include/asm-x86/e820_32.h | 2 - include/asm-x86/e820_64.h | 2 - 6 files changed, 88 insertions(+), 193 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 68aba41..4f2cd5d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -890,3 +890,89 @@ u64 __init e820_hole_size(u64 start, u64 end) } return end - start - ((u64)ram << PAGE_SHIFT); } + +static void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +/* "mem=nopentium" disables the 4MB page tables. */ +static int __init parse_memopt(char *p) +{ + u64 mem_size; + + if (!p) + return -EINVAL; + +#ifdef CONFIG_X86_32 + if (!strcmp(p, "nopentium")) { + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; + } +#endif + + mem_size = memparse(p, &p); + end_user_pfn = mem_size>>PAGE_SHIFT; + return 0; +} +early_param("mem", parse_memopt); + +static int userdef __initdata; + +static int __init parse_memmap_opt(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is + * reset. + */ + e820_register_active_regions(0, 0, -1UL); + saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); +#endif + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + end_user_pfn = (mem_size >> PAGE_SHIFT); + } + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void __init finish_e820_parsing(void) +{ + if (userdef) { + int nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } +} diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index e8a3b96..8de3df9 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -207,36 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource, } } -void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr; - int i; - - e820_print_map("limit_regions start"); - for (i = 0; i < e820.nr_map; i++) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr < size) - continue; - - if (e820.map[i].type != E820_RAM) - continue; - - if (e820.map[i].addr >= size) { - /* - * This region starts past the end of the - * requested size, skip it completely. - */ - e820.nr_map = i; - } else { - e820.nr_map = i + 1; - e820.map[i].size -= current_addr - size; - } - e820_print_map("limit_regions endfor"); - return; - } - e820_print_map("limit_regions endfunc"); -} - /* Overridden in paravirt.c if CONFIG_PARAVIRT */ char * __init __attribute__((weak)) memory_setup(void) { @@ -249,93 +219,3 @@ void __init setup_memory_map(void) e820_print_map(memory_setup()); } -static int __initdata user_defined_memmap; - -/* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to <mem>, overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * <start> to <start>+<mem>, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] - */ -static int __init parse_mem(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "nopentium") == 0) { - setup_clear_cpu_cap(X86_FEATURE_PSE); - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(arg, &arg); - limit_regions(mem_size); - user_defined_memmap = 1; - } - return 0; -} -early_param("mem", parse_mem); - -static int __init parse_memmap(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "exactmap") == 0) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); -#endif - e820.nr_map = 0; - user_defined_memmap = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(arg, &arg); - if (*arg == '@') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*arg == '#') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*arg == '$') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - user_defined_memmap = 1; - } - } - return 0; -} -early_param("memmap", parse_memmap); - -void __init finish_e820_parsing(void) -{ - if (user_defined_memmap) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } -} - diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 0afee2c..47952b1 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -96,75 +96,6 @@ char *__init machine_specific_memory_setup(void) return who; } -static int __init parse_memopt(char *p) -{ - if (!p) - return -EINVAL; - end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; - return 0; -} -early_param("mem", parse_memopt); - -static int userdef __initdata; - -static int __init parse_memmap_opt(char *p) -{ - char *oldp; - unsigned long long start_at, mem_size; - - if (!strcmp(p, "exactmap")) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real mem size before original memory map is - * reset. - */ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); -#endif - e820.nr_map = 0; - userdef = 1; - return 0; - } - - oldp = p; - mem_size = memparse(p, &p); - if (p == oldp) - return -EINVAL; - - userdef = 1; - if (*p == '@') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*p == '#') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*p == '$') { - start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - end_user_pfn = (mem_size >> PAGE_SHIFT); - } - return *p == '\0' ? 0 : -EINVAL; -} -early_param("memmap", parse_memmap_opt); - -void __init finish_e820_parsing(void) -{ - if (userdef) { - int nr = e820.nr_map; - - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) - early_panic("Invalid user supplied memory map"); - e820.nr_map = nr; - - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } -} - int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) { int i; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 8aa3232..a5959e3 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -98,6 +98,8 @@ extern int e820_find_active_region(const struct e820entry *ei, extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern u64 e820_hole_size(u64 start, u64 end); +extern void finish_e820_parsing(void); + #endif /* __ASSEMBLY__ */ #define ISA_START_ADDRESS 0xa0000 diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 212b74c..9135ce6 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -19,9 +19,7 @@ #ifndef __ASSEMBLY__ extern void setup_memory_map(void); -extern void finish_e820_parsing(void); -extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 368585d..478547c 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -22,8 +22,6 @@ extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); extern int is_memory_all_valid(unsigned long start, unsigned long end); -extern void finish_e820_parsing(void); - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ -- cgit v1.1 From b20d70b70e9aa854c47d2af10659f1033b6d69bb Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 9 Jun 2008 17:00:15 -0700 Subject: x86: make generic arch support NUMAQ, fix fix typo in bigsmp switching. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mach-generic/probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index ba18dec..5a7e461 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -64,7 +64,7 @@ early_param("apic", parse_apic); void __init generic_bigsmp_probe(void) { -#if CONFIG_X86_BIGSMP +#ifdef CONFIG_X86_BIGSMP /* * This routine is used to switch to bigsmp mode when * - There is no apic= option specified by the user -- cgit v1.1 From b1f006b65c12b85df81f12c1073ad18fd26f4a16 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 9 Jun 2008 18:11:36 -0700 Subject: x86: make generic arch support NUMAQ, fix #2 we are checking mptable early for numaq, so don't need to reserve_bootmem for it. bootmem is not there yet. do the same thing as 64-bit. found it on 64g above system from 64-bit kernel kexec to 32 bit kernel with numaq support. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/mpparse.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6ae6090..7ac1b68 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -853,9 +853,13 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, smp_found_config = 1; #endif mpf_found = mpf; -#ifdef CONFIG_X86_32 + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", mpf, virt_to_phys(mpf)); + + if (!reserve) + return 1; +#ifdef CONFIG_X86_32 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { @@ -877,9 +881,6 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, } #else - if (!reserve) - return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, BOOTMEM_DEFAULT); if (mpf->mpf_physptr) -- cgit v1.1 From d2dbf343329dc777d77488743465f7be4245971d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 13 Jun 2008 02:00:56 -0700 Subject: x86: clean up reserve_bootmem_generic() and port it to 32-bit 1. add reserve_bootmem_generic for 32bit 2. change len to unsigned long 3. make early_res_to_bootmem to use it Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/mpparse.c | 18 ++++++------------ arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 3 ++- include/asm-x86/proto.h | 2 -- include/linux/bootmem.h | 2 ++ 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4f2cd5d..774063f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -635,7 +635,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) continue; printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, final_start, final_end - 1, r->name); - reserve_bootmem(final_start, final_end - final_start, + reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 7ac1b68..b62ac6b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -859,10 +859,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; -#ifdef CONFIG_X86_32 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { + unsigned long size = PAGE_SIZE; +#ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute * table size yet, as only few megabytes from @@ -872,22 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. */ - unsigned long size = PAGE_SIZE; unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size, +#endif + reserve_bootmem_generic(mpf->mpf_physptr, size, BOOTMEM_DEFAULT); } -#else - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, - BOOTMEM_DEFAULT); - if (mpf->mpf_physptr) - reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE, BOOTMEM_DEFAULT); -#endif - return 1; + return 1; } bp += 4; length -= 16; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e7bb5e..abadb1d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -785,3 +785,9 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bf7bf1d..b8c2c1e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,7 +799,8 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) { #ifdef CONFIG_NUMA int nid, next_nid; diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index a9f5147..3dd458c 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,8 +14,6 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); - extern void syscall32_cpu_init(void); extern void check_efer(void); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 686895b..a1d9b79 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -84,6 +84,8 @@ extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, + int flags); extern unsigned long free_all_bootmem(void); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern void *__alloc_bootmem_node(pg_data_t *pgdat, -- cgit v1.1 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() in case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early kva ram area, in case there are holes in highmem, to avoid those area could be treat as free high pages. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6..c3f119e 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<<PAGE_SHIFT, ((u64)size)<<PAGE_SHIFT, LARGE_PAGE_BYTES); - node_end_target -= LARGE_PAGE_BYTES; - } while (node_end_final == -1ULL && - (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - - node_end_pfn[nid] = node_end_final>>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<<PAGE_SHIFT), + "KVA RAM"); + + node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397..034a315 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7..d80e186 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.1 From 9a27f5c51629c3d3b7718dd4be3d2722b472fafe Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 13 Jun 2008 20:07:03 -0700 Subject: x86: clean up relocate_initrd 1. move that before zone_sizes_init ... 2. add free_early for one old one, otherwise it will be be reserved again when we init highmem. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 1d4be07..72b11d4 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -584,6 +584,9 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); + + /* need to free that, otherwise init highmem will reserve it again */ + free_early(ramdisk_image, ramdisk_image+ramdisk_size); } #endif /* CONFIG_BLK_DEV_INITRD */ @@ -801,10 +804,6 @@ void __init setup_arch(char **cmdline_p) init_ohci1394_dma_on_all_controllers(); #endif - remapped_pgdat_init(); - sparse_init(); - zone_sizes_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ @@ -813,6 +812,10 @@ void __init setup_arch(char **cmdline_p) relocate_initrd(); #endif + remapped_pgdat_init(); + sparse_init(); + zone_sizes_init(); + paravirt_post_allocator_init(); dmi_scan_machine(); -- cgit v1.1 From d867e5310bd3c560093d39669ef52ff7f1b5711a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 14 Jun 2008 01:26:41 -0700 Subject: x86: keep MP_intsrc_info untouched if we do not update mptable Daniel Exner reported IO-APIC enumeration breakage in linux-next. Alexey Starikovskiy found out that it might be related to commit 2944e16b25 "x86: update mptable". use enable_update_mptable to decide if need check before add mp_irqs array. Reported-by: Daniel Exner <webmaster@dragonslave.de> Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 3 +++ arch/x86/kernel/mpparse.c | 13 +++++++------ include/asm-x86/mpspec.h | 1 + 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index caf4ed7..4d370b1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1172,6 +1172,9 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, struct mpc_config_intsrc intsrc; int ioapic; + if (!enable_update_mptable) + return 0; + /* print the entry should happen on mptable identically */ intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqtype = mp_INT; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b62ac6b..014ac5d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -34,6 +34,8 @@ #include <mach_mpparse.h> #endif +int enable_update_mptable; + /* * Checksum an MP configuration block. */ @@ -295,10 +297,11 @@ void MP_intsrc_info(struct mpc_config_intsrc *m) print_MP_intsrc_info(m); - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) - return; - } + if (enable_update_mptable) + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); if (++mp_irq_entries == MAX_IRQ_SOURCES) @@ -1110,8 +1113,6 @@ out: return 0; } -int __initdata enable_update_mptable; - static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index b8ba374..f48dbca 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -61,6 +61,7 @@ extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); extern void MP_intsrc_info(struct mpc_config_intsrc *m); #ifdef CONFIG_X86_IO_APIC +extern int enable_update_mptable; extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); #else -- cgit v1.1 From 6df8809bbd1b48cc6d428df7372ed749c8711641 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 15 Jun 2008 18:19:46 -0700 Subject: x86: use dstapic in mp_config_acpi_legacy_irqs so we don't get the same value multiple times. also make mp_config_acpi_legacy_irqs more readable by moving assignments together. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4d370b1..29b365a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -962,8 +962,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { - int ioapic = -1; - int pin = -1; + int ioapic; + int pin; /* * Convert 'gsi' to 'ioapic.pin'. @@ -997,8 +997,9 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) void __init mp_config_acpi_legacy_irqs(void) { - int i = 0; - int ioapic = -1; + int i; + int ioapic; + unsigned int dstapic; #if defined (CONFIG_MCA) || defined (CONFIG_EISA) /* @@ -1023,6 +1024,7 @@ void __init mp_config_acpi_legacy_irqs(void) ioapic = mp_find_ioapic(0); if (ioapic < 0) return; + dstapic = mp_ioapics[ioapic].mp_apicid; /* * Use the default configuration for the IRQs 0-15. Unless @@ -1031,11 +1033,6 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = mp_ioapics[ioapic].mp_apicid; - for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; @@ -1045,9 +1042,8 @@ void __init mp_config_acpi_legacy_irqs(void) break; /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mp_dstapic == - mp_irqs[mp_irq_entries].mp_dstapic) && - (irq->mp_dstirq == i)) + if (irq->mp_dstapic == dstapic && + irq->mp_dstirq == i) break; } @@ -1056,8 +1052,12 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } + mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; + mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ + mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; + mp_irqs[mp_irq_entries].mp_dstapic = dstapic; mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ + mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ mp_irqs[mp_irq_entries].mp_dstirq = i; if (++mp_irq_entries == MAX_IRQ_SOURCES) -- cgit v1.1 From d0be6bdea103b8d04c8a3495538b7c0011ae4129 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 15 Jun 2008 18:58:51 -0700 Subject: x86: rename two e820 related functions rename update_memory_range to e820_update_range rename add_memory_region to e820_add_region to make it more clear that they are about e820 map operations. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/e820.c | 16 ++++++++-------- arch/x86/kernel/efi.c | 2 +- arch/x86/lguest/boot.c | 2 +- arch/x86/mach-default/setup.c | 4 ++-- arch/x86/mach-visws/setup.c | 6 +++--- arch/x86/mach-voyager/setup.c | 12 ++++++------ arch/x86/xen/setup.c | 4 ++-- include/asm-x86/e820.h | 4 ++-- 10 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 479926d..66b1409 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -292,7 +292,7 @@ void __init early_gart_iommu_check(void) E820_RAM)) { /* reserved it, so we can resuse it in second kernel */ printk(KERN_INFO "update e820 for GART\n"); - add_memory_region(aper_base, aper_size, E820_RESERVED); + e820_add_region(aper_base, aper_size, E820_RESERVED); update_e820(); } return; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 0642201..105afe1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1440,7 +1440,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return update_memory_range(trim_start, trim_size, E820_RAM, + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } /** diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 774063f..5051ce7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -94,7 +94,7 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) /* * Add a memory region to the kernel e820 map. */ -void __init add_memory_region(u64 start, u64 size, int type) +void __init e820_add_region(u64 start, u64 size, int type) { int x = e820.nr_map; @@ -384,12 +384,12 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map) if (start > end) return -1; - add_memory_region(start, size, type); + e820_add_region(start, size, type); } while (biosmap++, --nr_map); return 0; } -u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { int i; @@ -414,7 +414,7 @@ u64 __init update_memory_range(u64 start, u64 size, unsigned old_type, final_end = min(start + size, ei->addr + ei->size); if (final_start >= final_end) continue; - add_memory_region(final_start, final_end - final_start, + e820_add_region(final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; } @@ -774,7 +774,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) return 0; addr = round_down(start + size - sizet, align); - update_memory_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); update_e820(); @@ -949,13 +949,13 @@ static int __init parse_memmap_opt(char *p) userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RAM); + e820_add_region(start_at, mem_size, E820_RAM); } else if (*p == '#') { start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_ACPI); + e820_add_region(start_at, mem_size, E820_ACPI); } else if (*p == '$') { start_at = memparse(p+1, &p); - add_memory_region(start_at, mem_size, E820_RESERVED); + e820_add_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index d5c7fcd..473c89f 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -233,7 +233,7 @@ static void __init add_efi_memmap(void) e820_type = E820_RAM; else e820_type = E820_RESERVED; - add_memory_region(start, size, e820_type); + e820_add_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5e47729..e72cf07 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -835,7 +835,7 @@ static __init char *lguest_memory_setup(void) /* The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ - add_memory_region(boot_params.e820_map[0].addr, + e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 56b4c39..7ae692a 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -184,8 +184,8 @@ char * __init machine_specific_memory_setup(void) } e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } return who; } diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c index de4c9db..d67868e 100644 --- a/arch/x86/mach-visws/setup.c +++ b/arch/x86/mach-visws/setup.c @@ -175,9 +175,9 @@ char * __init machine_specific_memory_setup(void) sgivwfb_mem_size &= ~((1 << 20) - 1); sgivwfb_mem_phys = mem_size - gfx_mem_size; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - add_memory_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); + e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); return "PROM"; } diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index f4aca9f..f27b583 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -74,7 +74,7 @@ char *__init machine_specific_memory_setup(void) e820.nr_map = 0; for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { - add_memory_region(addr, length, E820_RAM); + e820_add_region(addr, length, E820_RAM); } return who; } else if (voyager_level == 4) { @@ -92,14 +92,14 @@ char *__init machine_specific_memory_setup(void) tom = (boot_params.screen_info.ext_mem_k) << 10; } who = "Voyager-TOM"; - add_memory_region(0, 0x9f000, E820_RAM); + e820_add_region(0, 0x9f000, E820_RAM); /* map from 1M to top of memory */ - add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, + e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, E820_RAM); /* FIXME: Should check the ASICs to see if I need to * take out the 8M window. Just do it at the moment * */ - add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024, + e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024, E820_RESERVED); return who; } @@ -131,8 +131,8 @@ char *__init machine_specific_memory_setup(void) } e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } return who; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 82517e4..9001c9d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -39,8 +39,8 @@ char * __init xen_memory_setup(void) unsigned long max_pfn = xen_start_info->nr_pages; e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); return "Xen"; } diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index a5959e3..6b0ce74 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -60,12 +60,12 @@ extern struct e820map e820; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); -extern void add_memory_region(u64 start, u64 size, int type); +extern void e820_add_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); -extern u64 update_memory_range(u64 start, u64 size, unsigned old_type, +extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void update_e820(void); extern void e820_setup_gap(void); -- cgit v1.1 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce7..ed46b7a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e..7c4d025 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1d..ba07a48 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce74..55d3105 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b..85c4fea 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a315..e4de460 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e186..41c6e3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.1 From 8c5beb50d3ec915d15c4d38aa37282309a65f14e Mon Sep 17 00:00:00 2001 From: "Huang, Ying" <ying.huang@intel.com> Date: Wed, 11 Jun 2008 11:33:39 +0800 Subject: x86 boot: pass E820 memory map entries more than 128 via linked list of setup data Because of the size limits of struct boot_params (zero page), the maximum number of E820 memory map entries can be passed to kernel is 128. As pointed by Paul Jackson, there is some machine produced by SGI with so many nodes that the number of E820 memory map entries is more than 128. To enabling Linux kernel on these system, a new setup data type named SETUP_E820_EXT is defined to pass additional memory map entries to Linux kernel. This patch is based on x86/auto-latest branch of git-x86 tree and has been tested on x86_64 and i386 platform. Signed-off-by: Huang Ying <ying.huang@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 59 +++++++++++++++++++++++++++++++++++---------- arch/x86/kernel/setup.c | 3 +++ include/asm-x86/bootparam.h | 1 + include/asm-x86/e820.h | 2 ++ 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed46b7a..544dd12 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -359,6 +359,26 @@ static struct e820entry new_bios[E820_X_MAX] __initdata; return 0; } +static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map) +{ + while (nr_map) { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + e820_add_region(start, size, type); + + biosmap++; + nr_map--; + } + return 0; +} + /* * Copy the BIOS e820 map into a safe place. * @@ -374,19 +394,7 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map) if (nr_map < 2) return -1; - do { - u64 start = biosmap->addr; - u64 size = biosmap->size; - u64 end = start + size; - u32 type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - e820_add_region(start, size, type); - } while (biosmap++, --nr_map); - return 0; + return __copy_e820_map(biosmap, nr_map); } u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, @@ -496,6 +504,31 @@ __init void e820_setup_gap(void) pci_mem_start, gapstart, gapsize); } +/** + * Because of the size limitation of struct boot_params, only first + * 128 E820 memory entries are passed to kernel via + * boot_params.e820_map, others are passed via SETUP_E820_EXT node of + * linked list of struct setup_data, which is parsed here. + */ +void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) +{ + u32 map_len; + int entries; + struct e820entry *extmap; + + entries = sdata->len / sizeof(struct e820entry); + map_len = sdata->len + sizeof(struct setup_data); + if (map_len > PAGE_SIZE) + sdata = early_ioremap(pa_data, map_len); + extmap = (struct e820entry *)(sdata->data); + __copy_e820_map(extmap, entries); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + if (map_len > PAGE_SIZE) + early_iounmap(sdata, map_len); + printk(KERN_INFO "extended physical RAM map:\n"); + e820_print_map("extended"); +} + #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) /** diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 45a5e24..5b0de38 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -150,6 +150,9 @@ void __init parse_setup_data(void) while (pa_data) { data = early_ioremap(pa_data, PAGE_SIZE); switch (data->type) { + case SETUP_E820_EXT: + parse_e820_ext(data, pa_data); + break; default: break; } diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index 0a07390..876f211 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -11,6 +11,7 @@ /* setup data types */ #define SETUP_NONE 0 +#define SETUP_E820_EXT 1 /* extensible setup data list node */ struct setup_data { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 55d3105..77fc24d 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -69,6 +69,8 @@ extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern void update_e820(void); extern void e820_setup_gap(void); +struct setup_data; +extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); #if defined(CONFIG_X86_64) || \ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) -- cgit v1.1 From 41c094fd3ca54f1a71233049cf136ff94c91f4ae Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 16 Jun 2008 13:03:31 -0700 Subject: x86: move e820_resource_resources to e820.c and make 32-bit resource registration more like 64 bit. also move probe_roms back to setup_32.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 32 +++++++ arch/x86/kernel/e820_32.c | 192 ----------------------------------------- arch/x86/kernel/e820_64.c | 24 ------ arch/x86/kernel/setup_32.c | 208 ++++++++++++++++++++++++++++++++++++++------- include/asm-x86/e820.h | 1 + include/asm-x86/e820_32.h | 4 - include/asm-x86/e820_64.h | 1 - 7 files changed, 208 insertions(+), 254 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 544dd12..432c491 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -998,3 +998,35 @@ void __init finish_e820_parsing(void) e820_print_map("user"); } } + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +void __init e820_reserve_resources(void) +{ + int i; + struct resource *res; + + res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + for (i = 0; i < e820.nr_map; i++) { + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; +#ifndef CONFIG_RESOURCES_64BIT + if (res->end > 0x100000000ULL) { + res++; + continue; + } +#endif + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + res++; + } +} + + diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 8de3df9..1205ccf 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -15,198 +15,6 @@ #include <asm/e820.h> #include <asm/setup.h> -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -static void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - -/* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ -void __init init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) -{ - int i; - - probe_roms(); - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; -#ifndef CONFIG_RESOURCES_64BIT - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; -#endif - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res)) { - kfree(res); - continue; - } - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, code_resource); - request_resource(res, data_resource); - request_resource(res, bss_resource); -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - request_resource(res, &crashk_res); -#endif - } - } -} - /* Overridden in paravirt.c if CONFIG_PARAVIRT */ char * __init __attribute__((weak)) memory_setup(void) { diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 47952b1..cb03bff 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -41,30 +41,6 @@ unsigned long end_pfn; */ unsigned long max_pfn_mapped; -/* - * Mark e820 reserved areas as busy for the resource manager. - */ -void __init e820_reserve_resources(void) -{ - int i; - struct resource *res; - - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); - for (i = 0; i < e820.nr_map; i++) { - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - res++; - } -} - static void early_panic(char *msg) { early_printk(msg); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 72b11d4..f3ddba5 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -445,25 +445,28 @@ static void __init reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret == 0 && crash_size > 0) { - if (crash_base > 0) { - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - if (reserve_bootmem(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation " - "failed - memory is in use\n"); - return; - } - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - } else + if (crash_base <= 0) { printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); } } #else @@ -675,6 +678,8 @@ int x86_cpu_to_node_map_init[NR_CPUS] = { DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; #endif +static void probe_roms(void); + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -684,6 +689,7 @@ DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; */ void __init setup_arch(char **cmdline_p) { + int i; unsigned long max_low_pfn; memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -745,6 +751,13 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + probe_roms(); + + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; @@ -861,9 +874,16 @@ void __init setup_arch(char **cmdline_p) "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); #endif - e820_setup_gap(); + e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); + request_resource(&iomem_resource, &video_ram_resource); + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + + e820_setup_gap(); + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) @@ -874,25 +894,147 @@ void __init setup_arch(char **cmdline_p) #endif } -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +static void __init probe_roms(void) { + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; int i; - printk(KERN_INFO "Setting up standard PCI resources\n"); - init_iomem_resources(&code_resource, &data_resource, &bss_resource); + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; - request_resource(&iomem_resource, &video_ram_resource); + video_rom_resource.start = start; - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } } -subsys_initcall(request_standard_resources); diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 77fc24d..e860fe7 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -100,6 +100,7 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern u64 e820_hole_size(u64 start, u64 end); extern void finish_e820_parsing(void); +extern void e820_reserve_resources(void); #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 9135ce6..557b890 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -20,9 +20,5 @@ extern void setup_memory_map(void); -extern void init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource); - #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 478547c..8d99210 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -16,7 +16,6 @@ #ifndef __ASSEMBLY__ extern void setup_memory_region(void); extern void contig_e820_setup(void); -extern void e820_reserve_resources(void); extern int e820_any_non_reserved(unsigned long start, unsigned long end); extern int is_memory_any_valid(unsigned long start, unsigned long end); extern int e820_all_non_reserved(unsigned long start, unsigned long end); -- cgit v1.1 From cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 16 Jun 2008 16:11:08 -0700 Subject: x86: kill bad_ppro so don't punish all other cpus without that problem when init highmem Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 9 +++++++++ arch/x86/mm/discontig_32.c | 4 ++-- arch/x86/mm/init_32.c | 43 ++++++++++++------------------------------- include/asm-x86/highmem.h | 2 +- include/asm-x86/numa_32.h | 2 +- 5 files changed, 25 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index f3ddba5..9692aeb 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -68,6 +68,7 @@ #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/efi.h> +#include <asm/bugs.h> /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* @@ -764,6 +765,14 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); + if (ppro_with_ram_bug()) { + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, + E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820_print_map("bad_ppro"); + } + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7c4d025..6216e43 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -427,7 +427,7 @@ void __init zone_sizes_init(void) return; } -void __init set_highmem_pages_init(int bad_ppro) +void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM struct zone *zone; @@ -447,7 +447,7 @@ void __init set_highmem_pages_init(int bad_ppro) zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn, bad_ppro); + zone_end_pfn); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ba07a48..fb5694d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -220,13 +220,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) } } -static inline int page_kills_ppro(unsigned long pagenr) -{ - if (pagenr >= 0x70000 && pagenr <= 0x7003F) - return 1; - return 0; -} - /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. @@ -287,22 +280,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init -add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init add_one_highpage_init(struct page *page, int pfn) { - if (!(bad_ppro && page_kills_ppro(pfn))) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; - } else - SetPageReserved(page); + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } struct add_highpages_data { unsigned long start_pfn; unsigned long end_pfn; - int bad_ppro; }; static void __init add_highpages_work_fn(unsigned long start_pfn, @@ -312,10 +300,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, struct page *page; unsigned long final_start_pfn, final_end_pfn; struct add_highpages_data *data; - int bad_ppro; data = (struct add_highpages_data *)datax; - bad_ppro = data->bad_ppro; final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); @@ -327,29 +313,26 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn); } } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn, - int bad_ppro) + unsigned long end_pfn) { struct add_highpages_data data; data.start_pfn = start_pfn; data.end_pfn = end_pfn; - data.bad_ppro = bad_ppro; work_with_active_regions(nid, add_highpages_work_fn, &data); } #ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(int bad_ppro) +static void __init set_highmem_pages_init(void) { - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, - bad_ppro); + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); totalram_pages += totalhigh_pages; } @@ -358,7 +341,7 @@ static void __init set_highmem_pages_init(int bad_ppro) #else # define kmap_init() do { } while (0) # define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init(bad_ppro) do { } while (0) +# define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ pteval_t __PAGE_KERNEL = _PAGE_KERNEL; @@ -605,13 +588,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; - int tmp, bad_ppro; + int tmp; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); - #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { @@ -634,7 +615,7 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(bad_ppro); + set_highmem_pages_init(); codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index 85c4fea..4514b16 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -75,7 +75,7 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn, int bad_ppro); + unsigned long end_pfn); #endif /* __KERNEL__ */ diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h index 03d0f7a..a02674f 100644 --- a/include/asm-x86/numa_32.h +++ b/include/asm-x86/numa_32.h @@ -5,7 +5,7 @@ extern int pxm_to_nid(int pxm); #ifdef CONFIG_NUMA extern void __init remap_numa_kva(void); -extern void set_highmem_pages_init(int); +extern void set_highmem_pages_init(void); #else static inline void remap_numa_kva(void) { -- cgit v1.1 From 064d25f12014ae1d97c2882f9ab874995321f2b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 16 Jun 2008 19:58:28 -0700 Subject: x86: merge setup_memory_map with e820 ... and kill e820_32/64.c and e820_32/64.h Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 72 +++++++++++++++++++++++++++++++++ arch/x86/kernel/e820_32.c | 29 -------------- arch/x86/kernel/e820_64.c | 92 ------------------------------------------- arch/x86/kernel/setup_64.c | 8 +--- arch/x86/mach-default/setup.c | 47 ---------------------- arch/x86/mm/init_64.c | 12 ++++++ include/asm-x86/e820.h | 9 +++-- include/asm-x86/e820_32.h | 24 ----------- include/asm-x86/e820_64.h | 26 ------------ include/asm-x86/setup.h | 5 --- 11 files changed, 92 insertions(+), 234 deletions(-) delete mode 100644 arch/x86/kernel/e820_32.c delete mode 100644 arch/x86/kernel/e820_64.c delete mode 100644 include/asm-x86/e820_32.h delete mode 100644 include/asm-x86/e820_64.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dc3c636..bcc2b12 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += bootflag.o e820_$(BITS).o e820.o +obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 432c491..4947748 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1029,4 +1029,76 @@ void __init e820_reserve_resources(void) } } +char *__init __attribute__((weak)) machine_specific_memory_setup(void) +{ + char *who = "BIOS-e820"; + int new_nr; + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + new_nr = boot_params.e820_entries; + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &new_nr); + boot_params.e820_entries = new_nr; + if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { +#ifdef CONFIG_X86_64 + early_panic("Cannot find a valid memory map"); +#else + unsigned long mem_size; + + /* compare results from other methods and take the greater */ + if (boot_params.alt_mem_k + < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; + who = "BIOS-88"; + } else { + mem_size = boot_params.alt_mem_k; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); +#endif + } + + /* In case someone cares... */ + return who; +} + +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __init __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + +void __init setup_memory_map(void) +{ + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map(memory_setup()); +} + +#ifdef CONFIG_X86_64 +int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) +{ + int i; + if (slot < 0 || slot >= e820.nr_map) + return -1; + for (i = slot; i < e820.nr_map; i++) { + if (e820.map[i].type != E820_RAM) + continue; + break; + } + if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) + return -1; + *addr = e820.map[i].addr; + *size = min_t(u64, e820.map[i].size + e820.map[i].addr, + max_pfn << PAGE_SHIFT) - *addr; + return i + 1; +} +#endif diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c deleted file mode 100644 index 1205ccf..0000000 --- a/arch/x86/kernel/e820_32.c +++ /dev/null @@ -1,29 +0,0 @@ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/pfn.h> -#include <linux/uaccess.h> - -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/setup.h> - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - -void __init setup_memory_map(void) -{ - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(memory_setup()); -} - diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c deleted file mode 100644 index cb03bff..0000000 --- a/arch/x86/kernel/e820_64.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Handle the memory map. - * The functions here do the job until bootmem takes over. - * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/pfn.h> -#include <linux/pci.h> - -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/setup.h> -#include <asm/sections.h> -#include <asm/kdebug.h> -#include <asm/trampoline.h> - -/* - * PFN of last memory page. - */ -unsigned long end_pfn; - -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_pfn_mapped; - -static void early_panic(char *msg) -{ - early_printk(msg); - panic(msg); -} - -/* We're not void only for x86 32-bit compat */ -char *__init machine_specific_memory_setup(void) -{ - char *who = "BIOS-e820"; - int new_nr; - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) - early_panic("Cannot find a valid memory map"); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); - - /* In case someone cares... */ - return who; -} - -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index ec65696..3220c7b 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -266,12 +266,6 @@ static inline void __init reserve_crashkernel(void) {} #endif -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -void __attribute__((weak)) __init memory_setup(void) -{ - machine_specific_memory_setup(); -} - #ifdef CONFIG_PCI_MMCONFIG extern void __cpuinit fam10h_check_enable_mmcfg(void); extern void __init check_enable_amd_mmconf_dmi(void); @@ -316,7 +310,7 @@ void __init setup_arch(char **cmdline_p) ARCH_SETUP - memory_setup(); + setup_memory_map(); copy_edd(); if (!boot_params.hdr.root_flags) diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 7ae692a..2f5e277 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -142,50 +142,3 @@ static int __init print_ipi_mode(void) late_initcall(print_ipi_mode); -/** - * machine_specific_memory_setup - Hook for machine specific memory setup. - * - * Description: - * This is included late in kernel/setup.c so that it can make - * use of all of the static functions. - **/ - -char * __init machine_specific_memory_setup(void) -{ - char *who; - int new_nr; - - - who = "BIOS-e820"; - - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { - mem_size = boot_params.screen_info.ext_mem_k; - who = "BIOS-88"; - } else { - mem_size = boot_params.alt_mem_k; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - return who; -} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b8c2c1e..5266f31 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -47,6 +47,18 @@ #include <asm/numa.h> #include <asm/cacheflush.h> +/* + * PFN of last memory page. + */ +unsigned long end_pfn; + +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_pfn_mapped; + static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index e860fe7..83144cb 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -101,6 +101,9 @@ extern void e820_register_active_regions(int nid, unsigned long start_pfn, extern u64 e820_hole_size(u64 start, u64 end); extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); +extern void setup_memory_map(void); +extern char *machine_specific_memory_setup(void); +extern char *memory_setup(void); #endif /* __ASSEMBLY__ */ @@ -111,10 +114,10 @@ extern void e820_reserve_resources(void); #define BIOS_END 0x00100000 #ifdef __KERNEL__ +#include <linux/ioport.h> + #ifdef CONFIG_X86_32 -# include "e820_32.h" -#else -# include "e820_64.h" +#define HIGH_MEMORY (1024*1024) #endif #endif /* __KERNEL__ */ diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h deleted file mode 100644 index 557b890..0000000 --- a/include/asm-x86/e820_32.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * structures and definitions for the int 15, ax=e820 memory map - * scheme. - * - * In a nutshell, arch/i386/boot/setup.S populates a scratch table - * in the empty_zero_block that contains a list of usable address/size - * duples. In arch/i386/kernel/setup.c, this information is - * transferred into the e820map, and in arch/i386/mm/init.c, that - * new information is used to mark pages reserved or not. - * - */ -#ifndef __E820_HEADER -#define __E820_HEADER - -#include <linux/ioport.h> - -#define HIGH_MEMORY (1024*1024) - -#ifndef __ASSEMBLY__ - -extern void setup_memory_map(void); - -#endif/*!__ASSEMBLY__*/ -#endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h deleted file mode 100644 index 8d99210..0000000 --- a/include/asm-x86/e820_64.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * structures and definitions for the int 15, ax=e820 memory map - * scheme. - * - * In a nutshell, setup.S populates a scratch table in the - * empty_zero_block that contains a list of usable address/size - * duples. setup.c, this information is transferred into the e820map, - * and in init.c/numa.c, that new information is used to mark pages - * reserved or not. - */ -#ifndef __E820_HEADER -#define __E820_HEADER - -#include <linux/ioport.h> - -#ifndef __ASSEMBLY__ -extern void setup_memory_region(void); -extern void contig_e820_setup(void); -extern int e820_any_non_reserved(unsigned long start, unsigned long end); -extern int is_memory_any_valid(unsigned long start, unsigned long end); -extern int e820_all_non_reserved(unsigned long start, unsigned long end); -extern int is_memory_all_valid(unsigned long start, unsigned long end); - -#endif/*!__ASSEMBLY__*/ - -#endif/*__E820_HEADER*/ diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 9e163fc..b676b0b 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -8,7 +8,6 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); -char *machine_specific_memory_setup(void); #ifndef CONFIG_PARAVIRT #define paravirt_post_allocator_init() do {} while (0) #endif @@ -50,10 +49,6 @@ extern struct boot_params boot_params; */ #define LOWMEMSIZE() (0x9f000) -char * __init machine_specific_memory_setup(void); -char *memory_setup(void); - - void __init i386_start_kernel(void); extern unsigned long init_pg_tables_start; -- cgit v1.1 From 593a0cc39046515a438ca9fcc168115961f9f503 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 17 Jun 2008 10:02:45 -0700 Subject: x86: move some function out of setup_bootmem_alloc ... to make it more like 64-bit. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9692aeb..90a2b85 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -623,21 +623,6 @@ void __init setup_bootmem_allocator(void) free_bootmem_with_active_regions(i, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif -#ifdef CONFIG_X86_FIND_SMP_CONFIG - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif - reserve_crashkernel(); - - reserve_ibft_region(); } /* @@ -792,6 +777,22 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = setup_memory(); +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + */ + acpi_reserve_bootmem(); +#endif +#ifdef CONFIG_X86_FIND_SMP_CONFIG + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + reserve_crashkernel(); + + reserve_ibft_region(); + #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif -- cgit v1.1 From 0699eae140a3eeca976df4e3b7699b1fa3f763cd Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 17 Jun 2008 15:39:01 -0700 Subject: x86: Kconfig cleanup with genericarch we already have summit and etc depends on genericarch, so use genericarch only. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07276ac..7ebd986 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -446,7 +446,7 @@ config MEMTEST_BOOTPARAM_VALUE config ACPI_SRAT def_bool y - depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) + depends on X86_32 && ACPI && NUMA && X86_GENERICARCH select ACPI_NUMA config HAVE_ARCH_PARSE_SRAT @@ -455,11 +455,11 @@ config HAVE_ARCH_PARSE_SRAT config X86_SUMMIT_NUMA def_bool y - depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) + depends on X86_32 && NUMA && X86_GENERICARCH config X86_CYCLONE_TIMER def_bool y - depends on X86_32 && X86_SUMMIT || X86_GENERICARCH + depends on X86_GENERICARCH config ES7000_CLUSTERED_APIC def_bool y @@ -909,9 +909,9 @@ config X86_PAE config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || X86_SUMMIT && ACPI) && EXPERIMENTAL) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC - default y if (X86_NUMAQ || X86_SUMMIT || X86_GENERICARCH) + default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. The kernel will try to allocate memory used by a CPU on the -- cgit v1.1 From 1c6e55032e24ff79668581a0f296c278ef7edd4e Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 17 Jun 2008 15:41:45 -0700 Subject: x86: use acpi_numa_init to parse on 32-bit numa seperate SRAT finding and parsing from get_memcfg_from_srat, and let getmemcfg_from_srat only handle array from previous step. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 36 +++++--- arch/x86/kernel/srat_32.c | 200 +++++++++++---------------------------------- include/asm-x86/acpi.h | 4 +- include/asm-x86/dmi.h | 8 -- 4 files changed, 75 insertions(+), 173 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 90a2b85..7e06ecd 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -59,6 +59,7 @@ #include <asm/setup.h> #include <asm/arch_hooks.h> #include <asm/sections.h> +#include <asm/dmi.h> #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> @@ -185,6 +186,12 @@ int bootloader_type; static unsigned int highmem_pages = -1; /* + * Early DMI memory + */ +int dmi_alloc_index; +char dmi_alloc_data[DMI_MAX_DATA]; + +/* * Setup options */ struct screen_info screen_info; @@ -775,6 +782,24 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820_end_of_ram(); } + dmi_scan_machine(); + + io_delay_init(); + +#ifdef CONFIG_ACPI + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); +#endif + +#ifdef CONFIG_ACPI_NUMA + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); +#endif + max_low_pfn = setup_memory(); #ifdef CONFIG_ACPI_SLEEP @@ -841,10 +866,6 @@ void __init setup_arch(char **cmdline_p) paravirt_post_allocator_init(); - dmi_scan_machine(); - - io_delay_init(); - #ifdef CONFIG_X86_SMP /* * setup to use the early static init tables during kernel startup @@ -861,13 +882,6 @@ void __init setup_arch(char **cmdline_p) generic_apic_probe(); #endif -#ifdef CONFIG_ACPI - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); -#endif - early_quirks(); #ifdef CONFIG_ACPI diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index e9d9172..5978023 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -42,7 +42,7 @@ #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) /* bitmap length; _PXM is at most 255 */ #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ +static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ #define MAX_CHUNKS_PER_NODE 3 #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) @@ -53,16 +53,37 @@ struct node_memory_chunk_s { u8 nid; // which cnode contains this chunk? u8 bank; // which mem bank on this node }; -static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; +static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; -static int num_memory_chunks; /* total number of memory chunks */ +static int __initdata num_memory_chunks; /* total number of memory chunks */ static u8 __initdata apicid_to_pxm[MAX_APICID]; +int numa_off __initdata; +int acpi_numa __initdata; + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + num_memory_chunks = 0; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + /* Identify CPU proximity domains */ -static void __init parse_cpu_affinity_structure(char *p) +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) { - struct acpi_srat_cpu_affinity *cpu_affinity = - (struct acpi_srat_cpu_affinity *) p; + if (srat_disabled()) + return; + if (cpu_affinity->header.length != + sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; /* empty entry */ @@ -80,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p) * Identify memory proximity domains and hot-remove capabilities. * Fill node memory chunk list structure. */ -static void __init parse_memory_affinity_structure (char *sratp) +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) { unsigned long long paddr, size; unsigned long start_pfn, end_pfn; u8 pxm; struct node_memory_chunk_s *p, *q, *pend; - struct acpi_srat_mem_affinity *memory_affinity = - (struct acpi_srat_mem_affinity *) sratp; + + if (srat_disabled()) + return; + if (memory_affinity->header.length != + sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) return; /* empty entry */ @@ -135,6 +163,14 @@ static void __init parse_memory_affinity_structure (char *sratp) "enabled and removable" : "enabled" ) ); } +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} /* * The SRAT table always lists ascending addresses, so can always * assume that the first "start" address that you see is the real @@ -167,39 +203,13 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c node_end_pfn[nid] = memory_chunk->end_pfn; } -/* Parse the ACPI Static Resource Affinity Table */ -static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) +int __init get_memcfg_from_srat(void) { - u8 *start, *end, *p; int i, j, nid; - start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ - p = start; - end = (u8 *)sratp + sratp->header.length; - memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ - memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); - - num_memory_chunks = 0; - while (p < end) { - switch (*p) { - case ACPI_SRAT_TYPE_CPU_AFFINITY: - parse_cpu_affinity_structure(p); - break; - case ACPI_SRAT_TYPE_MEMORY_AFFINITY: - parse_memory_affinity_structure(p); - break; - default: - printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); - break; - } - p += p[1]; - if (p[1] == 0) { - printk("acpi20_parse_srat: Entry length value is zero;" - " can't parse any further!\n"); - break; - } - } + if (srat_disabled()) + goto out_fail; if (num_memory_chunks == 0) { printk("could not finy any ACPI SRAT memory areas.\n"); @@ -248,7 +258,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } - + for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; unsigned long end = min(node_end_pfn[nid], max_pfn); @@ -258,118 +268,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) } return 1; out_fail: - return 0; -} - -struct acpi_static_rsdt { - struct acpi_table_rsdt table; - u32 padding[32]; /* Allow for 32 more table entries */ -}; - -int __init get_memcfg_from_srat(void) -{ - struct acpi_table_header *header = NULL; - struct acpi_table_rsdp *rsdp = NULL; - struct acpi_table_rsdt *rsdt = NULL; - acpi_native_uint rsdp_address = 0; - struct acpi_static_rsdt saved_rsdt; - int tables = 0; - int i = 0; - - rsdp_address = acpi_os_get_root_pointer(); - if (!rsdp_address) { - printk("%s: System description tables not found\n", - __func__); - goto out_err; - } - - printk("%s: assigning address to rsdp\n", __func__); - rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address; - if (!rsdp) { - printk("%s: Didn't find ACPI root!\n", __func__); - goto out_err; - } - - printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, - rsdp->oem_id); - - if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) { - printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__); - goto out_err; - } - - rsdt = (struct acpi_table_rsdt *) - early_ioremap(rsdp->rsdt_physical_address, sizeof(saved_rsdt)); - - if (!rsdt) { - printk(KERN_WARNING - "%s: ACPI: Invalid root system description tables (RSDT)\n", - __func__); - goto out_err; - } - - header = &rsdt->header; - - if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) { - printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); - early_iounmap(rsdt, sizeof(saved_rsdt)); - goto out_err; - } - - /* - * The number of tables is computed by taking the - * size of all entries (header size minus total - * size of RSDT) divided by the size of each entry - * (4-byte table pointers). - */ - tables = (header->length - sizeof(struct acpi_table_header)) / sizeof(u32); - - if (!tables) - goto out_err; - - memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); - early_iounmap(rsdt, sizeof(saved_rsdt)); - if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) { - printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", - saved_rsdt.table.header.length); - goto out_err; - } - - printk("Begin SRAT table scan....%d\n", tables); - - for (i = 0; i < tables; i++){ - int result; - u32 length; - /* Map in header, then map in full table length. */ - header = (struct acpi_table_header *) - early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); - if (!header) - break; - - printk(KERN_INFO "ACPI: %4.4s %08lX, %04X\n", - header->signature, - (unsigned long)saved_rsdt.table.table_offset_entry[i], - header->length); - - if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4)) { - early_iounmap(header, sizeof(struct acpi_table_header)); - continue; - } - - length = header->length; - early_iounmap(header, sizeof(struct acpi_table_header)); - header = (struct acpi_table_header *) - early_ioremap(saved_rsdt.table.table_offset_entry[i], length); - if (!header) - break; - - /* we've found the srat table. don't need to look at any more tables */ - result = acpi20_parse_srat((struct acpi_table_srat *)header); - early_iounmap(header, length); - return result; - } -out_err: - remove_all_active_ranges(); printk("failed to get NUMA memory information from SRAT table\n"); return 0; } diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h index 73ce5b3..635d764 100644 --- a/include/asm-x86/acpi.h +++ b/include/asm-x86/acpi.h @@ -161,9 +161,7 @@ struct bootnode; #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int acpi_scan_nodes(unsigned long start, unsigned long end); -#ifdef CONFIG_X86_64 -# define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#endif +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) extern void acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes); #else diff --git a/include/asm-x86/dmi.h b/include/asm-x86/dmi.h index 4edf751..58a8657 100644 --- a/include/asm-x86/dmi.h +++ b/include/asm-x86/dmi.h @@ -3,12 +3,6 @@ #include <asm/io.h> -#ifdef CONFIG_X86_32 - -#define dmi_alloc alloc_bootmem - -#else /* CONFIG_X86_32 */ - #define DMI_MAX_DATA 2048 extern int dmi_alloc_index; @@ -25,8 +19,6 @@ static inline void *dmi_alloc(unsigned len) return dmi_alloc_data + idx; } -#endif - /* Use early IO mappings for DMI because it's initialized early */ #define dmi_ioremap early_ioremap #define dmi_iounmap early_iounmap -- cgit v1.1 From 66a6f8d539416c9664f3a04ecb12f55a0097ab8c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 18 Jun 2008 11:54:37 -0700 Subject: x86: remove unused file after numaq etc depends on genericarch we don't need those mach_mpspec.h files now. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mach-bigsmp/mach_mpspec.h | 8 -------- include/asm-x86/mach-es7000/mach_mpspec.h | 8 -------- include/asm-x86/mach-numaq/mach_mpspec.h | 8 -------- include/asm-x86/mach-summit/mach_mpspec.h | 9 --------- 4 files changed, 33 deletions(-) delete mode 100644 include/asm-x86/mach-bigsmp/mach_mpspec.h delete mode 100644 include/asm-x86/mach-es7000/mach_mpspec.h delete mode 100644 include/asm-x86/mach-numaq/mach_mpspec.h delete mode 100644 include/asm-x86/mach-summit/mach_mpspec.h diff --git a/include/asm-x86/mach-bigsmp/mach_mpspec.h b/include/asm-x86/mach-bigsmp/mach_mpspec.h deleted file mode 100644 index 6b5dadc..0000000 --- a/include/asm-x86/mach-bigsmp/mach_mpspec.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ASM_MACH_MPSPEC_H -#define __ASM_MACH_MPSPEC_H - -#define MAX_IRQ_SOURCES 256 - -#define MAX_MP_BUSSES 32 - -#endif /* __ASM_MACH_MPSPEC_H */ diff --git a/include/asm-x86/mach-es7000/mach_mpspec.h b/include/asm-x86/mach-es7000/mach_mpspec.h deleted file mode 100644 index b1f5039..0000000 --- a/include/asm-x86/mach-es7000/mach_mpspec.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ASM_MACH_MPSPEC_H -#define __ASM_MACH_MPSPEC_H - -#define MAX_IRQ_SOURCES 256 - -#define MAX_MP_BUSSES 256 - -#endif /* __ASM_MACH_MPSPEC_H */ diff --git a/include/asm-x86/mach-numaq/mach_mpspec.h b/include/asm-x86/mach-numaq/mach_mpspec.h deleted file mode 100644 index dffb098..0000000 --- a/include/asm-x86/mach-numaq/mach_mpspec.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ASM_MACH_MPSPEC_H -#define __ASM_MACH_MPSPEC_H - -#define MAX_IRQ_SOURCES 512 - -#define MAX_MP_BUSSES 32 - -#endif /* __ASM_MACH_MPSPEC_H */ diff --git a/include/asm-x86/mach-summit/mach_mpspec.h b/include/asm-x86/mach-summit/mach_mpspec.h deleted file mode 100644 index bd76552..0000000 --- a/include/asm-x86/mach-summit/mach_mpspec.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __ASM_MACH_MPSPEC_H -#define __ASM_MACH_MPSPEC_H - -#define MAX_IRQ_SOURCES 256 - -/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */ -#define MAX_MP_BUSSES 260 - -#endif /* __ASM_MACH_MPSPEC_H */ -- cgit v1.1 From 95a71a45c250177854f7c530810c88a8a19a443b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 18 Jun 2008 17:27:08 -0700 Subject: x86: cleanup machine_specific_memory_setup, v2 1. let 64bit support 88 and e801 too 2. introduce default_machine_specific_memory_setup, and reuse it for voyager v2: fix 64 bit compiling Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 13 +++++++------ arch/x86/mach-voyager/setup.c | 32 +------------------------------- include/asm-x86/e820.h | 3 +-- include/asm-x86/setup.h | 3 ++- 4 files changed, 11 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4947748..7b613d2 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1029,7 +1029,7 @@ void __init e820_reserve_resources(void) } } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) +char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; int new_nr; @@ -1045,10 +1045,7 @@ char *__init __attribute__((weak)) machine_specific_memory_setup(void) &new_nr); boot_params.e820_entries = new_nr; if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { -#ifdef CONFIG_X86_64 - early_panic("Cannot find a valid memory map"); -#else - unsigned long mem_size; + u64 mem_size; /* compare results from other methods and take the greater */ if (boot_params.alt_mem_k @@ -1063,13 +1060,17 @@ char *__init __attribute__((weak)) machine_specific_memory_setup(void) e820.nr_map = 0; e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); -#endif } /* In case someone cares... */ return who; } +char *__init __attribute__((weak)) machine_specific_memory_setup(void) +{ + return default_machine_specific_memory_setup(); +} + /* Overridden in paravirt.c if CONFIG_PARAVIRT */ char * __init __attribute__((weak)) memory_setup(void) { diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index f27b583..6bbdd63 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -104,35 +104,5 @@ char *__init machine_specific_memory_setup(void) return who; } - who = "BIOS-e820"; - - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - new_nr = boot_params.e820_entries; - sanitize_e820_map(boot_params.e820_map, - ARRAY_SIZE(boot_params.e820_map), - &new_nr); - boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) - < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { - mem_size = boot_params.screen_info.ext_mem_k; - who = "BIOS-88"; - } else { - mem_size = boot_params.alt_mem_k; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - return who; + return default_machine_specific_memory_setup(); } diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 83144cb..668a0d7 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -102,6 +102,7 @@ extern u64 e820_hole_size(u64 start, u64 end); extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); extern void setup_memory_map(void); +extern char *default_machine_specific_memory_setup(void); extern char *machine_specific_memory_setup(void); extern char *memory_setup(void); @@ -116,9 +117,7 @@ extern char *memory_setup(void); #ifdef __KERNEL__ #include <linux/ioport.h> -#ifdef CONFIG_X86_32 #define HIGH_MEMORY (1024*1024) -#endif #endif /* __KERNEL__ */ #endif /* __ASM_E820_H */ diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index b676b0b..e14b6e7 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -42,13 +42,14 @@ void vsmp_init(void); */ extern struct boot_params boot_params; -#ifdef __i386__ /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. */ #define LOWMEMSIZE() (0x9f000) +#ifdef __i386__ + void __init i386_start_kernel(void); extern unsigned long init_pg_tables_start; -- cgit v1.1 From fcfa146e412023dd55f8855f240b2c2082dc1baa Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 18 Jun 2008 17:29:31 -0700 Subject: x86: update mptable fix with no ioapic v2 if the system doesn't have ioapic, we don't need to store entries for mptable update also let mp_config_acpi_gsi not call func in mpparse so later could decouple mpparse with acpi more easily Reported-by: Daniel Exner <dex@dragonslave.de> Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Daniel Exner <dex@dragonslave.de> Cc: Len Brown <lenb@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 87 +++++++++++++++++++++++++++++---------------- arch/x86/kernel/mpparse.c | 19 +++++----- include/asm-x86/mpspec.h | 2 -- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 29b365a..91bb9a9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -960,10 +960,37 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +static void assign_to_mp_irq(struct mp_config_intsrc *m, + struct mp_config_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); +} + +static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, + struct mp_config_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); +} + +static void save_mp_irq(struct mp_config_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { int ioapic; int pin; + struct mp_config_intsrc mp_irq; /* * Convert 'gsi' to 'ioapic.pin'. @@ -981,18 +1008,15 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mp_irqflag = (trigger << 2) | polarity; - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_srcbusirq = bus_irq; /* IRQ */ - mp_irqs[mp_irq_entries].mp_dstapic = - mp_ioapics[ioapic].mp_apicid; /* APIC ID */ - mp_irqs[mp_irq_entries].mp_dstirq = pin; /* INTIN# */ - - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); + mp_irq.mp_type = MP_INTSRC; + mp_irq.mp_irqtype = mp_INT; + mp_irq.mp_irqflag = (trigger << 2) | polarity; + mp_irq.mp_srcbus = MP_ISA_BUS; + mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ + mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ + mp_irq.mp_dstirq = pin; /* INTIN# */ + save_mp_irq(&mp_irq); } void __init mp_config_acpi_legacy_irqs(void) @@ -1000,6 +1024,7 @@ void __init mp_config_acpi_legacy_irqs(void) int i; int ioapic; unsigned int dstapic; + struct mp_config_intsrc mp_irq; #if defined (CONFIG_MCA) || defined (CONFIG_EISA) /* @@ -1052,16 +1077,15 @@ void __init mp_config_acpi_legacy_irqs(void) continue; /* IRQ already used */ } - mp_irqs[mp_irq_entries].mp_type = MP_INTSRC; - mp_irqs[mp_irq_entries].mp_irqflag = 0; /* Conforming */ - mp_irqs[mp_irq_entries].mp_srcbus = MP_ISA_BUS; - mp_irqs[mp_irq_entries].mp_dstapic = dstapic; - mp_irqs[mp_irq_entries].mp_irqtype = mp_INT; - mp_irqs[mp_irq_entries].mp_srcbusirq = i; /* Identity mapped */ - mp_irqs[mp_irq_entries].mp_dstirq = i; + mp_irq.mp_type = MP_INTSRC; + mp_irq.mp_irqflag = 0; /* Conforming */ + mp_irq.mp_srcbus = MP_ISA_BUS; + mp_irq.mp_dstapic = dstapic; + mp_irq.mp_irqtype = mp_INT; + mp_irq.mp_srcbusirq = i; /* Identity mapped */ + mp_irq.mp_dstirq = i; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); + save_mp_irq(&mp_irq); } } @@ -1169,25 +1193,26 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity) { - struct mpc_config_intsrc intsrc; +#ifdef CONFIG_X86_MPPARSE + struct mp_config_intsrc mp_irq; int ioapic; - if (!enable_update_mptable) + if (!acpi_ioapic) return 0; /* print the entry should happen on mptable identically */ - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + mp_irq.mp_type = MP_INTSRC; + mp_irq.mp_irqtype = mp_INT; + mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - intsrc.mpc_srcbus = number; - intsrc.mpc_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + mp_irq.mp_srcbus = number; + mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - intsrc.mpc_dstapic = mp_ioapic_routing[ioapic].apic_id; - intsrc.mpc_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; - - MP_intsrc_info(&intsrc); + mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; + mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; + save_mp_irq(&mp_irq); +#endif return 0; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 014ac5d..8b6b1e0 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -34,8 +34,6 @@ #include <mach_mpparse.h> #endif -int enable_update_mptable; - /* * Checksum an MP configuration block. */ @@ -246,7 +244,7 @@ static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); } -static void assign_to_mp_irq(struct mpc_config_intsrc *m, +static void __init assign_to_mp_irq(struct mpc_config_intsrc *m, struct mp_config_intsrc *mp_irq) { mp_irq->mp_dstapic = m->mpc_dstapic; @@ -270,7 +268,7 @@ static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, m->mpc_dstirq = mp_irq->mp_dstirq; } -static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, +static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, struct mpc_config_intsrc *m) { if (mp_irq->mp_dstapic != m->mpc_dstapic) @@ -291,17 +289,16 @@ static int mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, return 0; } -void MP_intsrc_info(struct mpc_config_intsrc *m) +static void __init MP_intsrc_info(struct mpc_config_intsrc *m) { int i; print_MP_intsrc_info(m); - if (enable_update_mptable) - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) - return; - } + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) + return; + } assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); if (++mp_irq_entries == MAX_IRQ_SOURCES) @@ -1113,6 +1110,8 @@ out: return 0; } +static int __initdata enable_update_mptable; + static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index f48dbca..6ec1a54 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -59,9 +59,7 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); -extern void MP_intsrc_info(struct mpc_config_intsrc *m); #ifdef CONFIG_X86_IO_APIC -extern int enable_update_mptable; extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); #else -- cgit v1.1 From 6695c85b2e6f9f2e9ccaa8af38b72f5ab4a5184f Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 19 Jun 2008 12:13:09 -0700 Subject: x86: let MPS support be selectable, v2 v2: seperate "fix for compiling when MPPARSE is not set" to another patch make X86_MPPARSE to be selectable only when acpi is set and X86_MPPARSE will be set if acpi is not set. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Maciej W. Rozycki <macro@linux-mips.org> Cc: Len Brown <lenb@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 21 +++++++++++++++++++++ arch/x86/Kconfig.debug | 9 --------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7ebd986..0804a88 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -230,6 +230,27 @@ config SMP If you don't know what to do here, say N. +config X86_FIND_SMP_CONFIG + def_bool y + depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS + depends on X86_32 + +if ACPI +config X86_MPPARSE + def_bool y + bool "Enable MPS table" + depends on ((X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64) + help + For old smp systems that do not have proper acpi support. Newer systems + (esp with 64bit cpus) with acpi support, MADT and DSDT will override it +endif + +if !ACPI +config X86_MPPARSE + def_bool y + depends on ((X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64) +endif + choice prompt "Subarchitecture Type" default X86_PC diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 253e7a5..f7d413d 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -129,15 +129,6 @@ config 4KSTACKS on the VM subsystem for higher order allocations. This option will also use IRQ stacks to compensate for the reduced stackspace. -config X86_FIND_SMP_CONFIG - def_bool y - depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS - depends on X86_32 - -config X86_MPPARSE - def_bool y - depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64 - config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED -- cgit v1.1 From a4caa18efe468acb3522e30763de57a67b3e438b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 19 Jun 2008 12:15:01 -0700 Subject: x86: fix compiling when CONFIG_X86_MPPARSE is not set Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 2 +- arch/x86/mm/k8topology_64.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0804a88..2525179 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -307,7 +307,7 @@ if X86_GENERICARCH config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" - depends on SMP && X86_32 + depends on SMP && X86_32 && X86_MPPARSE select NUMA help This option is used for getting Linux to run on a NUMAQ (IBM/Sequent) diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 1f476e4..f75aa2a 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -56,18 +56,22 @@ static __init void early_get_boot_cpu_id(void) /* * Find possible boot-time SMP configuration: */ +#ifdef CONFIG_X86_MPPARSE early_find_smp_config(); +#endif #ifdef CONFIG_ACPI /* * Read APIC information from ACPI tables. */ early_acpi_boot_init(); #endif +#ifdef CONFIG_X86_MPPARSE /* * get boot-time SMP configuration: */ if (smp_found_config) early_get_smp_config(); +#endif early_init_lapic_mapping(); } -- cgit v1.1 From c73d8dd8595c4c6c1c016bb1ac4dd8035e67975b Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Tue, 8 Jul 2008 10:47:39 +0200 Subject: Revert parts of "x86: update mptable" Signed-off-by: Ingo Molnar <mingo@elte.hu> --- drivers/acpi/pci_irq.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index e556f30..89022a7 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -570,11 +570,6 @@ int acpi_pci_irq_enable(struct pci_dev *dev) (triggering == ACPI_LEVEL_SENSITIVE) ? "level" : "edge", (polarity == ACPI_ACTIVE_LOW) ? "low" : "high", dev->irq); -#ifdef CONFIG_X86 - mp_config_acpi_gsi(dev->bus->number, dev->devfn, dev->pin, irq, - triggering, polarity); -#endif - return 0; } -- cgit v1.1 From bad48f4b313a756ccde454c25c14c828e2fd5819 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 20 Jun 2008 07:33:31 -0700 Subject: x86: simplify x86_mpparse dependency check "Maciej W. Rozycki" <macro@linux-mips.org> said: > Given X86_64 selects X86_LOCAL_APIC I am not sure the redundancy seen >above does not actually obscure the logic behind... I think: > > depends on X86_LOCAL_APIC && !X86_VISWS > >would be clearer and get the same. Suggested-by: Maciej W. Rozycki <macro@linux-mips.org> Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Len Brown <lenb@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2525179..23c352e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -239,7 +239,7 @@ if ACPI config X86_MPPARSE def_bool y bool "Enable MPS table" - depends on ((X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64) + depends on X86_LOCAL_APIC && !X86_VISWS help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it @@ -248,7 +248,7 @@ endif if !ACPI config X86_MPPARSE def_bool y - depends on ((X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64) + depends on X86_LOCAL_APIC && !X86_VISWS endif choice -- cgit v1.1 From 471694ea6c6ccdf7131354f1d698d4d83a605293 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" <macro@linux-mips.org> Date: Tue, 1 Jul 2008 01:11:35 +0100 Subject: x86, ioapic, acpi: add a knob to disable IRQ 0 through I/O APIC As discovered recently some systems exhibit problems when the 8254 timer IRQ is routed through the I/O APIC. These problems do not affect the timer IRQ itself and therefore cannot be detected when the correctness of operation of the interrupt is verified in check_timer(). Therefore the I/O APIC path of the timer IRQ has to be disabled entirely. This is a change that lets platforms ask for the timer IRQ not to be registered in the I/O APIC interrupt tables. The local APIC and ExtINTA paths are unaffected. This request is only taken into account for ACPI platforms as MP table systems seem unaffected so far. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Cc: Len Brown <lenb@kernel.org> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 91bb9a9..92a5426 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -83,6 +83,8 @@ int acpi_lapic; int acpi_ioapic; int acpi_strict; +static int disable_irq0_through_ioapic __initdata; + u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; @@ -992,6 +994,10 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) int pin; struct mp_config_intsrc mp_irq; + /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ + if (bus_irq == 0 && disable_irq0_through_ioapic) + return; + /* * Convert 'gsi' to 'ioapic.pin'. */ @@ -1058,6 +1064,10 @@ void __init mp_config_acpi_legacy_irqs(void) for (i = 0; i < 16; i++) { int idx; + /* Skip the 8254 timer interrupt (IRQ 0) if requested. */ + if (i == 0 && disable_irq0_through_ioapic) + continue; + for (idx = 0; idx < mp_irq_entries; idx++) { struct mp_config_intsrc *irq = mp_irqs + idx; -- cgit v1.1 From 9340e1ccdf7b9b22a2be7f51cd74e8b5e11961bf Mon Sep 17 00:00:00 2001 From: Matthew Garrett <mjg59@srcf.ucam.org> Date: Tue, 1 Jul 2008 01:12:06 +0100 Subject: x86, ioapic, acpi quirk: disable IRQ 0 through I/O APIC for some HP systems Some HP laptops have a problem with their DSDT reporting as HP/SB400/10000, which includes some code which overrides all temperature trip points to 16C if the INTIN2 input of the I/O APIC is enabled. This input is incorrectly designated the ISA IRQ 0 via an interrupt source override even though it is wired to the output of the master 8259A and INTIN0 is not connected at all. So far two models have been identified, namely nx6125 and nx6325. Use a knob provided by the I/O APIC interrupt registration code to abandon any attempts to route IRQ 0 through the I/O APIC for these systems. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Cc: Len Brown <lenb@kernel.org> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 92a5426..9908ef4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1427,6 +1427,17 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) } /* + * Don't register any I/O APIC entries for the 8254 timer IRQ. + */ +static int __init +dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d) +{ + pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident); + disable_irq0_through_ioapic = 1; + return 0; +} + +/* * If your system is blacklisted here, but you find that acpi=force * works for you, please contact acpi-devel@sourceforge.net */ @@ -1593,6 +1604,32 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + /* + * HP laptops which use a DSDT reporting as HP/SB400/10000, + * which includes some code which overrides all temperature + * trip points to 16C if the INTIN2 input of the I/O APIC + * is enabled. This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 + * is not connected at all. Abandon any attempts to route + * IRQ 0 through the I/O APIC therefore. + */ + { + .callback = dmi_disable_irq0_through_ioapic, + .ident = "HP NX6125 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"), + }, + }, + { + .callback = dmi_disable_irq0_through_ioapic, + .ident = "HP NX6325 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, {} }; -- cgit v1.1 From b755de8dfdfef97effaa91379ffafcb81f4d62a1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <Yinghai.Lu@Sun.COM> Date: Wed, 20 Feb 2008 12:41:52 -0800 Subject: x86: make dev_to_node return online node a numa system (with multi HT chains) may return node without ram. Aka it is not online. Try to get an online node, otherwise return -1. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/pci/acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f..ea8685f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -172,6 +172,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do set_mp_bus_to_node(busnum, node); else node = get_mp_bus_to_node(busnum); + + if (node != -1 && !node_online(node)) + node = -1; #endif /* Allocate per-root-bus (not per bus) arch-specific data. -- cgit v1.1 From dbb6152e6f72df367f8a955586c5e6282a7255e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel.send@gmail.com> Date: Sat, 19 Apr 2008 01:30:16 -0700 Subject: x86: don't call pxm_to_node again also make bus_numa work even if ACPI_NUMA is not defined. don't call pxm_to_node again, and use node directly. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/pci/acpi.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ea8685f..28d17a5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -171,11 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (node != -1) set_mp_bus_to_node(busnum, node); else +#endif node = get_mp_bus_to_node(busnum); if (node != -1 && !node_online(node)) node = -1; -#endif /* Allocate per-root-bus (not per bus) arch-specific data. * TODO: leak; this memory is never freed. @@ -207,14 +207,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (!bus) kfree(sd); + if (bus && node != -1) { #ifdef CONFIG_ACPI_NUMA - if (bus) { - if (pxm >= 0) { + if (pxm >= 0) printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", - busnum, pxm, pxm_to_node(pxm)); - } - } + busnum, pxm, node); +#else + printk(KERN_DEBUG "bus %02x -> node %d\n", + busnum, node); #endif + } if (bus && (pci_probe & PCI_USE__CRS)) get_current_resources(device, busnum, domain, bus); -- cgit v1.1 From 7496b60654e759d0b9008b80908e80727904b3c4 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: fix remove cpu_pda table patch Mike Travis wrote: > Ingo Molnar wrote: >> * Mike Travis <travis@sgi.com> wrote: >> >>> [Ingo - please replace "PATCH 07/11" with this one.] >>> >>> * Remove 544k bytes from the kernel by removing the boot_cpu_pda >>> array from the data section and allocating it during startup. >>> >>> Fixed panic in setup_per_cpu_areas when HOTPLUG_CPU not set. >>> >>> For inclusion into sched-devel/latest tree. >> sched-devel.git randconfig testing found another crash with your queue: >> >> [ 0.111060] Brought up 1 CPUs >> [ 0.111986] Total of 1 processors activated (4022.73 BogoMIPS). >> [ 0.112987] Testing NMI watchdog ... <1>BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 >> [ 0.114982] IP: [<ffffffff8180d4a0>] check_nmi_watchdog+0xb0/0x210 >> [ 0.114982] PGD 0 >> [ 0.114982] Oops: 0000 [1] SMP >> [ 0.114982] CPU 0 >> [............] >> >> http://redhat.com/~mingo/misc/config-Mon_Apr_28_23_25_25_CEST_2008.bad >> http://redhat.com/~mingo/misc/log-Mon_Apr_28_23_25_25_CEST_2008.bad >> >> Ingo > > Hi Ingo, > > I need a bit more information on your hardware configuration. Building a > kernel with the above config file started up fine on both the Intel and AMD > boxes. > > Based on the above output it looks like it might be a UP machine? ... Ok, I think I found it. In check_nmi_watchdog(): for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; As I mentioned it works fine on both of my systems so could you try it out? Thanks! Mike -- * Change function check_nmi_watchdog() to use nr_cpu_ids instead of NR_CPUS. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/nmi_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded..2861b94 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -88,7 +88,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) return 0; - prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) return -1; @@ -99,7 +99,7 @@ int __init check_nmi_watchdog(void) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); #endif - for (cpu = 0; cpu < NR_CPUS; cpu++) + for (cpu = 0; cpu < nr_cpu_ids; cpu++) prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks -- cgit v1.1 From 1184dc2ffe2c8fb9afb766d870850f2c3165ef25 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: modify Kconfig to allow up to 4096 cpus * Increase the limit of NR_CPUS to 4096 and introduce a boolean called "MAXSMP" which when set (e.g. "allyesconfig"), will set NR_CPUS = 4096 and NODES_SHIFT = 9 (512). * Changed max setting for NODES_SHIFT from 15 to 9 to accurately reflect the real limit. Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bf07b6f..2e32552 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -561,20 +561,35 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) +config MAXSMP + bool "Configure Maximum number of SMP Processors and NUMA Nodes" + depends on X86_64 && SMP + default n + help + Configure maximum number of CPUS and NUMA Nodes for this architecture. + If unsure, say N. +if MAXSMP config NR_CPUS - int "Maximum number of CPUs (2-255)" - range 2 255 + int + default "4096" +endif + +if !MAXSMP +config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 depends on SMP default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 255 and the + kernel will support. The maximum supported value is 4096 and the minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. +endif config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -965,13 +980,25 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +if MAXSMP + +config NODES_SHIFT + int + default "9" +endif + +if !MAXSMP config NODES_SHIFT - int "Max num nodes shift(1-9)" - range 1 9 if X86_64 + int "Maximum NUMA Nodes (as a power of 2)" + range 1 9 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" depends on NEED_MULTIPLE_NODES + help + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accomodate various tables. +endif config HAVE_ARCH_BOOTMEM_NODE def_bool y -- cgit v1.1 From 23ca4bba3e20c6c3cb11c1bb0ab4770b724d39ac Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: cleanup early per cpu variables/accesses v4 * Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is used by some per_cpu variables that are initialized and accessed before there are per_cpu areas allocated. ["Early" in respect to per_cpu variables is "earlier than the per_cpu areas have been setup".] This patchset adds these new macros: DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) EXPORT_EARLY_PER_CPU_SYMBOL(_name) DECLARE_EARLY_PER_CPU(_type, _name) early_per_cpu_ptr(_name) early_per_cpu_map(_name, _idx) early_per_cpu(_name, _cpu) The DEFINE macro defines the per_cpu variable as well as the early map and pointer. It also initializes the per_cpu variable and map elements to "_initvalue". The early_* macros provide access to the initial map (usually setup during system init) and the early pointer. This pointer is initialized to point to the early map but is then NULL'ed when the actual per_cpu areas are setup. After that the per_cpu variable is the correct access to the variable. The early_per_cpu() macro is not very efficient but does show how to access the variable if you have a function that can be called both "early" and "late". It tests the early ptr to be NULL, and if not then it's still valid. Otherwise, the per_cpu variable is used instead: #define early_per_cpu(_name, _cpu) \ (early_per_cpu_ptr(_name) ? \ early_per_cpu_ptr(_name)[_cpu] : \ per_cpu(_name, _cpu)) A better method is to actually check the pointer manually. In the case below, numa_set_node can be called both "early" and "late": void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); if (cpu_to_node_map) cpu_to_node_map[cpu] = node; else per_cpu(x86_cpu_to_node_map, cpu) = node; } * Add a flag "arch_provides_topology_pointers" that indicates pointers to topology cpumask_t maps are available. Otherwise, use the function returning the cpumask_t value. This is useful if cpumask_t set size is very large to avoid copying data on to/off of the stack. * The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while the non-debug case has been optimized a bit. * Remove an unreferenced compiler warning in drivers/base/topology.c * Clean up #ifdef in setup.c For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/apic_32.c | 9 +-- arch/x86/kernel/apic_64.c | 11 ++-- arch/x86/kernel/setup.c | 96 ++++++++++++++++++++++++++---- arch/x86/kernel/setup_32.c | 24 -------- arch/x86/kernel/setup_64.c | 9 --- arch/x86/kernel/smpboot.c | 20 +------ arch/x86/mm/numa_64.c | 43 ++++---------- arch/x86/mm/srat_64.c | 2 +- drivers/base/topology.c | 25 +++++++- include/asm-x86/numa_64.h | 19 +++--- include/asm-x86/percpu.h | 46 +++++++++++++++ include/asm-x86/smp.h | 15 +---- include/asm-x86/topology.h | 143 ++++++++++++++++++++++++++------------------- 15 files changed, 270 insertions(+), 196 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2e32552..4469a0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE def_bool y config HAVE_SETUP_PER_CPU_AREA - def_bool X86_64 || (X86_SMP && !X86_VOYAGER) + def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 1836337..24ca95a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -60,7 +60,7 @@ config DEBUG_PAGEALLOC config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL - depends on X86_64_SMP + depends on X86_SMP default n help Say Y to verify that the per_cpu map being accessed has diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1b..f17c1c1 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -52,9 +52,6 @@ unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - /* * Knob to control our willingness to enable the local APIC. * @@ -1534,9 +1531,9 @@ void __cpuinit generic_processor_info(int apicid, int version) } #ifdef CONFIG_SMP /* are we being called early in kernel startup? */ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd..4fd21f7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -87,9 +87,6 @@ static unsigned long apic_phys; unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version @@ -1091,9 +1088,9 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu = 0; } /* are we being called early in kernel startup? */ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; @@ -1269,7 +1266,7 @@ __cpuinit int apic_is_clustered_box(void) if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) return 0; - bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b85..03caa8e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -19,13 +19,23 @@ unsigned disabled_cpus __cpuinitdata; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL(boot_cpu_physical_apicid); -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); - /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; #endif +/* map cpu index to physical APIC ID */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#define X86_64_NUMA 1 + +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +#endif + #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) /* * Copy data used in early init routines from the initial arrays to the @@ -37,20 +47,21 @@ static void __init setup_per_cpu_maps(void) int cpu; for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = - x86_bios_cpu_apicid_init[cpu]; -#ifdef CONFIG_NUMA + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA per_cpu(x86_cpu_to_node_map, cpu) = - x86_cpu_to_node_map_init[cpu]; + early_per_cpu_map(x86_cpu_to_node_map, cpu); #endif } /* indicate the early static arrays will soon be gone */ - x86_cpu_to_apicid_early_ptr = NULL; - x86_bios_cpu_apicid_early_ptr = NULL; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = NULL; + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif } @@ -109,7 +120,8 @@ void __init setup_per_cpu_areas(void) if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO - "cpu %d has no node or node-local memory\n", i); + "cpu %d has no node %d or node-local memory\n", + i, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); @@ -137,3 +149,63 @@ void __init setup_per_cpu_areas(void) } #endif + +#ifdef X86_64_NUMA +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + if (cpu_to_node_map) + cpu_to_node_map[cpu] = node; + + else if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +} +#endif /* CONFIG_NUMA */ + +#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64) + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +#endif diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5a2f8e0..ccd5f5c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -737,18 +737,6 @@ char * __init __attribute__((weak)) memory_setup(void) return machine_specific_memory_setup(); } -#ifdef CONFIG_NUMA -/* - * In the golden day, when everything among i386 and x86_64 will be - * integrated, this will not live here - */ -void *x86_cpu_to_node_map_early_ptr; -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -#endif - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -887,18 +875,6 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); -#ifdef CONFIG_X86_SMP - /* - * setup to use the early static init tables during kernel startup - * X86_SMP will exclude sub-arches that don't deal well with it. - */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff128..e8df64f 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -406,15 +406,6 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#ifdef CONFIG_SMP - /* setup to use the early static init tables during kernel startup */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e1cece..036604d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,22 +68,6 @@ #include <mach_wakecpu.h> #include <smpboot_hooks.h> -/* - * FIXME: For x86_64, those are defined in other files. But moving them here, - * would make the setup areas dependent on smp, which is a loss. When we - * integrate apic between arches, we can probably do a better job, but - * right now, they'll stay here -- glommer - */ - -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = - { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_early_ptr; - -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata - = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_bios_cpu_apicid_early_ptr; - #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; static int low_mappings; @@ -992,7 +976,7 @@ do_rest: /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); #ifdef CONFIG_X86_64 - clear_node_cpumask(cpu); /* was set by numa_add_cpu */ + numa_remove_cpu(cpu); /* was set by numa_add_cpu */ #endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ @@ -1373,7 +1357,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ clear_bit(cpu, (unsigned long *)&cpu_initialized); - clear_node_cpumask(cpu); + numa_remove_cpu(cpu); #endif } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d5..970f867 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,16 +31,6 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -#ifdef CONFIG_SMP -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -void *x86_cpu_to_node_map_early_ptr; -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); -#endif -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); - s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -577,24 +567,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__cpuinit void numa_add_cpu(int cpu) -{ - set_bit(cpu, - (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if(cpu_to_node_map) - cpu_to_node_map[cpu] = node; - else if(per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - unsigned long __init numa_free_all_bootmem(void) { unsigned long pages = 0; @@ -641,6 +613,7 @@ static __init int numa_setup(char *opt) } early_param("numa", numa_setup); +#ifdef CONFIG_NUMA /* * Setup early cpu_to_node. * @@ -652,14 +625,19 @@ early_param("numa", numa_setup); * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. */ void __init init_cpu_to_node(void) { - int i; + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - for (i = 0; i < NR_CPUS; i++) { + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { int node; - u16 apicid = x86_cpu_to_apicid_init[i]; + u16 apicid = cpu_to_apicid[cpu]; if (apicid == BAD_APICID) continue; @@ -668,8 +646,9 @@ void __init init_cpu_to_node(void) continue; if (!node_online(node)) continue; - numa_set_node(i, node); + numa_set_node(cpu, node); } } +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 99649dc..012220e 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; if (!node_isset(node, node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); + numa_clear_node(i); } numa_init_array(); return 0; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044..1efe162 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \ return sprintf(buf, "%d\n", topology_##name(cpu)); \ } +#if defined(topology_thread_siblings) || defined(topology_core_siblings) static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) { ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; @@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) } return n; } +#endif +#ifdef arch_provides_topology_pointers #define define_siblings_show_map(name) \ -static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(0, &(topology_##name(cpu)), buf); \ } #define define_siblings_show_list(name) \ -static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(1, &(topology_##name(cpu)), buf); \ } +#else +#define define_siblings_show_map(name) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(0, &mask, buf); \ +} + +#define define_siblings_show_list(name) \ +static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(1, &mask, buf); \ +} +#endif + #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index 22e87c9..b510daf 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_add_cpu(int cpu); extern void numa_init_array(void); extern int numa_off; -extern void numa_set_node(int cpu, int node); extern void srat_reserve_add_area(int nodeid); extern int hotadd_percent; @@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, #ifdef CONFIG_NUMA extern void __init init_cpu_to_node(void); - -static inline void clear_node_cpumask(int cpu) -{ - clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]); -} - +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); #else -#define init_cpu_to_node() do {} while (0) -#define clear_node_cpumask(cpu) do {} while (0) +static inline void init_cpu_to_node(void) { } +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +static inline void numa_add_cpu(int cpu, int node) { } +static inline void numa_remove_cpu(int cpu) { } #endif #endif diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h index 736fc3b..912a3a1 100644 --- a/include/asm-x86/percpu.h +++ b/include/asm-x86/percpu.h @@ -143,4 +143,50 @@ do { \ #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) #endif /* !__ASSEMBLY__ */ #endif /* !CONFIG_X86_64 */ + +#ifdef CONFIG_SMP + +/* + * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu + * variables that are initialized and accessed before there are per_cpu + * areas allocated. + */ + +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue; \ + __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \ + { [0 ... NR_CPUS-1] = _initvalue }; \ + __typeof__(_type) *_name##_early_ptr = _name##_early_map + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ + extern __typeof__(_type) _name##_early_map[] + +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) +#define early_per_cpu(_name, _cpu) \ + (early_per_cpu_ptr(_name) ? \ + early_per_cpu_ptr(_name)[_cpu] : \ + per_cpu(_name, _cpu)) + +#else /* !CONFIG_SMP */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name) + +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL +/* no early_per_cpu_map() */ + +#endif /* !CONFIG_SMP */ + #endif /* _ASM_X86_PERCPU_H_ */ diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5c..ec84163 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -29,21 +29,12 @@ extern int smp_num_siblings; extern unsigned int num_processors; extern cpumask_t cpu_initialized; -#ifdef CONFIG_SMP -extern u16 x86_cpu_to_apicid_init[]; -extern u16 x86_bios_cpu_apicid_init[]; -extern void *x86_cpu_to_apicid_early_ptr; -extern void *x86_bios_cpu_apicid_early_ptr; -#else -#define x86_cpu_to_apicid_early_ptr NULL -#define x86_bios_cpu_apicid_early_ptr NULL -#endif - DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); /* Static state in head.S used to set up a CPU */ extern struct { diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index dcf3f81..dac09cb 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -35,87 +35,67 @@ # endif #endif +/* Node not present */ +#define NUMA_NO_NODE (-1) + #ifdef CONFIG_NUMA #include <linux/cpumask.h> #include <asm/mpspec.h> -/* Mappings between logical cpu number and node number */ #ifdef CONFIG_X86_32 -extern int cpu_to_node_map[]; -#else -/* Returns the number of the current Node. */ -#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id())) -#endif - -DECLARE_PER_CPU(int, x86_cpu_to_node_map); - -#ifdef CONFIG_SMP -extern int x86_cpu_to_node_map_init[]; -extern void *x86_cpu_to_node_map_early_ptr; -#else -#define x86_cpu_to_node_map_early_ptr NULL -#endif +/* Mappings between node number and cpus on that node. */ extern cpumask_t node_to_cpumask_map[]; -#define NUMA_NO_NODE (-1) +/* Mappings between logical cpu number and node number */ +extern int cpu_to_node_map[]; /* Returns the number of the node containing CPU 'cpu' */ -#ifdef CONFIG_X86_32 -#define early_cpu_to_node(cpu) cpu_to_node(cpu) static inline int cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } +#define early_cpu_to_node(cpu) cpu_to_node(cpu) #else /* CONFIG_X86_64 */ -#ifdef CONFIG_SMP -static inline int early_cpu_to_node(int cpu) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if (cpu_to_node_map) - return cpu_to_node_map[cpu]; - else if (per_cpu_offset(cpu)) - return per_cpu(x86_cpu_to_node_map, cpu); - else - return NUMA_NO_NODE; -} -#else -#define early_cpu_to_node(cpu) cpu_to_node(cpu) -#endif +/* Mappings between node number and cpus on that node. */ +extern cpumask_t node_to_cpumask_map[]; + +/* Mappings between logical cpu number and node number */ +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); + +/* Returns the number of the current Node. */ +#define numa_node_id() (per_cpu(x86_cpu_to_node_map, raw_smp_processor_id())) + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +extern int cpu_to_node(int cpu); +extern int early_cpu_to_node(int cpu); +extern cpumask_t *_node_to_cpumask_ptr(int node); +extern cpumask_t node_to_cpumask(int node); +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +/* Returns the number of the node containing CPU 'cpu' */ static inline int cpu_to_node(int cpu) { -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (x86_cpu_to_node_map_early_ptr) { - printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n", - (int)cpu); - dump_stack(); - return ((int *)x86_cpu_to_node_map_early_ptr)[cpu]; - } -#endif return per_cpu(x86_cpu_to_node_map, cpu); } -#ifdef CONFIG_NUMA - -/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -#define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = &(node_to_cpumask_map[node]) - -#define node_to_cpumask_ptr_next(v, node) \ - v = &(node_to_cpumask_map[node]) -#endif +/* Same function but used if called before per_cpu areas are setup */ +static inline int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; -#endif /* CONFIG_X86_64 */ + return per_cpu(x86_cpu_to_node_map, cpu); +} -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &node_to_cpumask_map[node]; +} /* Returns a bitmask of CPUs on Node 'node'. */ static inline cpumask_t node_to_cpumask(int node) @@ -123,14 +103,29 @@ static inline cpumask_t node_to_cpumask(int node) return node_to_cpumask_map[node]; } +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* CONFIG_X86_64 */ + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) + +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) + /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - cpumask_t mask = node_to_cpumask(node); - - return first_cpu(mask); + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); } +/* + * Returns the number of the node containing Node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + #define pcibus_to_node(bus) __pcibus_to_node(bus) #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) @@ -180,8 +175,31 @@ extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif -#else /* CONFIG_NUMA */ +#else /* !CONFIG_NUMA */ + +#define numa_node_id() 0 +#define cpu_to_node(cpu) 0 +#define early_cpu_to_node(cpu) 0 + +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &cpu_online_map; +} +static inline cpumask_t node_to_cpumask(int node) +{ + return cpu_online_map; +} +static inline int node_to_first_cpu(int node) +{ + return first_cpu(cpu_online_map); +} + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) #endif #include <asm-generic/topology.h> @@ -193,6 +211,9 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) + +/* indicates that pointers to the topology cpumask_t maps are valid */ +#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) @@ -220,4 +241,4 @@ static inline void set_mp_bus_to_node(int busnum, int node) } #endif -#endif +#endif /* _ASM_X86_TOPOLOGY_H */ -- cgit v1.1 From 7891a24e1ee50c96896c0cf7da216a8e7b573ca5 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: restore pda nodenumber field * Restore the nodenumber field in the x86_64 pda. This field is slightly different than the x86_cpu_to_node_map mainly because it's a static indication of which node the cpu is on while the cpu to node map is a dyanamic mapping that may get reset if the cpu goes offline. This also simplifies the numa_node_id() macro. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/setup.c | 4 ++++ include/asm-x86/pda.h | 1 + include/asm-x86/topology.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 03caa8e..0dff17e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -32,6 +32,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) #define X86_64_NUMA 1 +/* map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); #endif @@ -155,6 +156,9 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + if (node != NUMA_NO_NODE) + cpu_pda(cpu)->nodenumber = node; + if (cpu_to_node_map) cpu_to_node_map[cpu] = node; diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index 101fb9e..de2ad9a 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,6 +22,7 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; + int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index dac09cb..c0e6ff7 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -66,7 +66,7 @@ extern cpumask_t node_to_cpumask_map[]; DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); /* Returns the number of the current Node. */ -#define numa_node_id() (per_cpu(x86_cpu_to_node_map, raw_smp_processor_id())) +#define numa_node_id() read_pda(nodenumber) #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); -- cgit v1.1 From 9f248bde9d47cc177011198c9a15fb339b9f3215 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: remove the static 256k node_to_cpumask_map * Consolidate node_to_cpumask operations and remove the 256k byte node_to_cpumask_map. This is done by allocating the node_to_cpumask_map array after the number of possible nodes (nr_node_ids) is known. * Debug printouts when CONFIG_DEBUG_PER_CPU_MAPS is active have been increased. It now shows faults when calling node_to_cpumask() and node_to_cpumask_ptr(). For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/setup.c | 132 +++++++++++++++++++++++++++++++++++++++++++-- arch/x86/mm/numa_64.c | 6 --- include/asm-x86/topology.h | 25 ++++++--- 3 files changed, 144 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0dff17e..913af83 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -35,6 +35,16 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* map cpu index to node index */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* which logical CPUs are on which nodes */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* setup node_to_cpumask_map */ +static void __init setup_node_to_cpumask_map(void); + +#else +static inline void setup_node_to_cpumask_map(void) { } #endif #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) @@ -140,11 +150,15 @@ void __init setup_per_cpu_areas(void) } nr_cpu_ids = highest_cpu + 1; - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", + NR_CPUS, nr_cpu_ids, nr_node_ids); /* Setup percpu data maps */ setup_per_cpu_maps(); + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + /* Setup cpumask_of_cpu map */ setup_cpumask_of_cpu(); } @@ -152,6 +166,35 @@ void __init setup_per_cpu_areas(void) #endif #ifdef X86_64_NUMA + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + + Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -174,6 +217,8 @@ void __cpuinit numa_clear_node(int cpu) numa_set_node(cpu, NUMA_NO_NODE); } +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + void __cpuinit numa_add_cpu(int cpu) { cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); @@ -183,9 +228,44 @@ void __cpuinit numa_remove_cpu(int cpu) { cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); } -#endif /* CONFIG_NUMA */ -#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64) +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), *mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); + } + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} int cpu_to_node(int cpu) { @@ -199,6 +279,10 @@ int cpu_to_node(int cpu) } EXPORT_SYMBOL(cpu_to_node); +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ int early_cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) @@ -207,9 +291,47 @@ int early_cpu_to_node(int cpu) if (!per_cpu_offset(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); + dump_stack(); return NUMA_NO_NODE; } return per_cpu(x86_cpu_to_node_map, cpu); } -#endif + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +cpumask_t *_node_to_cpumask_ptr(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return &cpu_online_map; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(_node_to_cpumask_ptr); + +/* + * Returns a bitmask of CPUs on Node 'node'. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + +#endif /* X86_64_NUMA */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 970f867..14c7ab4 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -35,9 +35,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_to_cpumask_map); - int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; @@ -560,9 +557,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - /* cpumask_of_cpu() may not be available during early startup */ - memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); - cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index c0e6ff7..1f97758 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -57,10 +57,16 @@ static inline int cpu_to_node(int cpu) } #define early_cpu_to_node(cpu) cpu_to_node(cpu) +/* Returns a bitmask of CPUs on Node 'node'. */ +static inline cpumask_t node_to_cpumask(int node) +{ + return node_to_cpumask_map[node]; +} + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ -extern cpumask_t node_to_cpumask_map[]; +extern cpumask_t *node_to_cpumask_map; /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -104,7 +110,6 @@ static inline cpumask_t node_to_cpumask(int node) } #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#endif /* CONFIG_X86_64 */ /* Replace default node_to_cpumask_ptr with optimized version */ #define node_to_cpumask_ptr(v, node) \ @@ -113,12 +118,7 @@ static inline cpumask_t node_to_cpumask(int node) #define node_to_cpumask_ptr_next(v, node) \ v = _node_to_cpumask_ptr(node) -/* Returns the number of the first CPU on Node 'node'. */ -static inline int node_to_first_cpu(int node) -{ - node_to_cpumask_ptr(mask, node); - return first_cpu(*mask); -} +#endif /* CONFIG_X86_64 */ /* * Returns the number of the node containing Node 'node'. This @@ -204,6 +204,15 @@ static inline int node_to_first_cpu(int node) #include <asm-generic/topology.h> +#ifdef CONFIG_NUMA +/* Returns the number of the first CPU on Node 'node'. */ +static inline int node_to_first_cpu(int node) +{ + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); +} +#endif + extern cpumask_t cpu_coregroup_map(int cpu); #ifdef ENABLE_TOPO_DEFINES -- cgit v1.1 From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/head64.c | 26 +++++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup64.c | 8 ++++-- arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++--------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b..0ab59ed 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include <asm/e820.h> #include <asm/bios_ebda.h> +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). + */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af83..dd12c1c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. + */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - __per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e82..631ea6c 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include <linux/bitops.h> #include <linux/module.h> #include <linux/kgdb.h> +#include <linux/topology.h> #include <asm/pda.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d..bf08334 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. + */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ do_rest: } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9a..b34e9a7 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943..0ea48a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.1 From 5deb0b2a25b7b568ab81f7c38052572ecf4ccc96 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: leave initial __cpu_pda array in place until cpus are booted Ingo Molnar wrote: ... > they crashed after about 3 randconfig iterations with: > > early res: 4 [8000-afff] PGTABLE > early res: 5 [b000-b87f] MEMNODEMAP > PANIC: early exception 0e rip 10:ffffffff8077a150 error 2 cr2 37 > Pid: 0, comm: swapper Not tainted 2.6.25-sched-devel.git-x86-latest.git #14 > > Call Trace: > [<ffffffff81466196>] early_idt_handler+0x56/0x6a > [<ffffffff8077a150>] ? numa_set_node+0x30/0x60 > [<ffffffff8077a129>] ? numa_set_node+0x9/0x60 > [<ffffffff8147a543>] numa_init_array+0x93/0xf0 > [<ffffffff8147b039>] acpi_scan_nodes+0x3b9/0x3f0 > [<ffffffff8147a496>] numa_initmem_init+0x136/0x150 > [<ffffffff8146da5f>] setup_arch+0x48f/0x700 > [<ffffffff802566ea>] ? clockevents_register_notifier+0x3a/0x50 > [<ffffffff81466a87>] start_kernel+0xd7/0x440 > [<ffffffff81466422>] x86_64_start_kernel+0x222/0x280 ... Here's the fixup... This one should follow the previous patches. Thanks, Mike Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/head64.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0ab59ed..4bcb61c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -29,17 +29,13 @@ static struct x8664_pda _boot_cpu_pda __read_mostly; #ifdef CONFIG_SMP -#ifdef CONFIG_DEBUG_PER_CPU_MAPS /* - * We install an empty cpu_pda pointer table to trap references before - * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). + * We install an empty cpu_pda pointer table to indicate to early users + * (numa_set_node) that the cpu_pda pointer table for cpus other than + * the boot cpu is not yet setup. */ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; #else -static struct x8664_pda *__cpu_pda[1] __read_mostly; -#endif - -#else /* !CONFIG_SMP (NR_CPUS will be 1) */ static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif -- cgit v1.1 From f307d25e638d3408659a2ec98fb3fd1737f7cb31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 21 May 2008 11:21:13 +0100 Subject: x86: compile error fix for smpboot.c Without this patch, my link fails with: arch/x86/kernel/built-in.o(.cpuinit.text+0x3c6e): In function `get_local_pda': : undefined reference to `_cpu_pda' arch/x86/kernel/built-in.o(.cpuinit.text+0x3cd1): In function `get_local_pda': : undefined reference to `after_bootmem' arch/x86/kernel/built-in.o(.cpuinit.text+0x3cec): In function `get_local_pda': : undefined reference to `_cpu_pda' make[2]: *** [.tmp_vmlinux1] Error 1 Caused by commit 766da892634694f795b18b9538407816896fc470 x86: remove static boot_cpu_pda array v2 Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/x86/kernel/smpboot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bf08334..bc1e125 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +#ifdef CONFIG_X86_64 /* * Allocate node local memory for the AP pda. * @@ -852,6 +853,7 @@ static int __cpuinit get_local_pda(int cpu) cpu_pda(cpu) = newpda; return 0; } +#endif /* CONFIG_X86_64 */ static int __cpuinit do_boot_cpu(int apicid, int cpu) /* -- cgit v1.1 From 864fc31ea59798905a37cd896a3e093915a3b366 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: numa_64.c make local variables static plat_node_bdata, cmdline, nodemap_addr, nodemap_size are local to numa_64.c. Make them static Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/numa_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 14c7ab4..824344f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -27,7 +27,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -bootmem_data_t plat_node_bdata[MAX_NUMNODES]; +static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; @@ -36,8 +36,8 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { }; int numa_off __initdata; -unsigned long __initdata nodemap_addr; -unsigned long __initdata nodemap_size; +static unsigned long __initdata nodemap_addr; +static unsigned long __initdata nodemap_size; /* * Given a shift value, try to populate memnodemap[] @@ -296,7 +296,7 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -char *cmdline __initdata; +static char *cmdline __initdata; /* * Setups up nid to range from addr to addr + size. If the end -- cgit v1.1 From 886533a3e370a6d5c4e46819d1e14bd2f20dbb3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 12 May 2008 15:43:36 +0200 Subject: x86: numa_64.c fix shadowed variable sparse mutters: arch/x86/mm/numa_64.c:195:27: warning: symbol 'end_pfn' shadows an earlier one Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/numa_64.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 824344f..a1f3778b 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start, void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); @@ -191,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, start, end); start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; + last_pfn = end >> PAGE_SHIFT; node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, SMP_CACHE_BYTES); @@ -204,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; /* * Find a place for the bootmem map @@ -213,7 +213,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, * early_node_mem will get that with find_e820_area instead * of alloc_bootmem, that could clash with reserved range */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); nid = phys_to_nid(nodedata_phys); if (nid == nodeid) bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); @@ -235,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + start_pfn, last_pfn); printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, @@ -400,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, } /* - * Sets up the system RAM area from start_pfn to end_pfn according to the + * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. */ static struct bootnode nodes[MAX_NUMNODES] __initdata; -static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) { u64 size, addr = start_pfn << PAGE_SHIFT; - u64 max_addr = end_pfn << PAGE_SHIFT; + u64 max_addr = last_pfn << PAGE_SHIFT; int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); @@ -514,7 +514,7 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn) { int i; @@ -522,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) + if (cmdline && !numa_emulation(start_pfn, last_pfn)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -530,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) + last_pfn << PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -538,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_K8_NUMA if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, - end_pfn<<PAGE_SHIFT)) + last_pfn<<PAGE_SHIFT)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -548,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT); + last_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; memnodemap = memnode.embedded_map; @@ -557,8 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - e820_register_active_regions(0, start_pfn, end_pfn); - setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + e820_register_active_regions(0, start_pfn, last_pfn); + setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); } unsigned long __init numa_free_all_bootmem(void) -- cgit v1.1 From 076ac2af86c3b7f89ac31bc50a7508d3e035b786 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Mon, 12 May 2008 21:21:12 +0200 Subject: sched, numa: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c, where appropriate. This saves some allocated space as well as many wasted cycles going through node entries that are non-existent. Signed-off-by: Mike Travis <travis@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 94ead43..bcc22b5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) min_val = INT_MAX; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Start at @node */ - n = (node + i) % MAX_NUMNODES; + n = (node + i) % nr_node_ids; if (!nr_cpus_node(n)) continue; @@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) if (!sched_group_nodes) continue; - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; *nodemask = node_to_cpumask(i); @@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); @@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif /* Set up physical groups */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { SCHED_CPUMASK_VAR(nodemask, allmasks); SCHED_CPUMASK_VAR(send_covered, allmasks); @@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, send_covered, tmpmask); } - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ struct sched_group *sg, *prev; SCHED_CPUMASK_VAR(nodemask, allmasks); @@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpus_or(*covered, *covered, *nodemask); prev = sg; - for (j = 0; j < MAX_NUMNODES; j++) { + for (j = 0; j < nr_node_ids; j++) { SCHED_CPUMASK_VAR(notcovered, allmasks); - int n = (i + j) % MAX_NUMNODES; + int n = (i + j) % nr_node_ids; node_to_cpumask_ptr(pnodemask, n); cpus_complement(*notcovered, *covered); @@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) + for (i = 0; i < nr_node_ids; i++) init_numa_sched_groups_power(sched_group_nodes[i]); if (sd_allnodes) { -- cgit v1.1 From 03db1f74a7d823e3de3767f36b1e08829f6fb3a1 Mon Sep 17 00:00:00 2001 From: Vegard Nossum <vegard.nossum@gmail.com> Date: Fri, 6 Jun 2008 16:33:25 +0200 Subject: x86: don't return invalid pointers from node_to_cpumask() Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index dd12c1c..df49ce8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -350,6 +350,7 @@ cpumask_t *_node_to_cpumask_ptr(int node) dump_stack(); return &cpu_online_map; } + BUG_ON(node >= nr_node_ids); return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); @@ -365,6 +366,7 @@ cpumask_t node_to_cpumask(int node) dump_stack(); return cpu_online_map; } + BUG_ON(node >= nr_node_ids); return node_to_cpumask_map[node]; } EXPORT_SYMBOL(node_to_cpumask); -- cgit v1.1 From 053713f5745b8b08fb598adb65230bc168cb9d8d Mon Sep 17 00:00:00 2001 From: Randy Dunlap <randy.dunlap@oracle.com> Date: Thu, 5 Jun 2008 11:10:59 -0700 Subject: x86: fix setup.c printk format warning Fix setup.c printk format warning: linux-next-20080605/arch/x86/kernel/setup.c: In function 'setup_per_cpu_areas': linux-next-20080605/arch/x86/kernel/setup.c:173: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ssize_t' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index df49ce8..d4eaa4e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -169,7 +169,7 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { -- cgit v1.1 From 3fd052b1b46ac23a2316283a996fe6c32dbcf132 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Sun, 8 Jun 2008 15:46:30 +0200 Subject: x86: add flags parameter to reserve_bootmem_generic() This patch adds a 'flags' parameter to reserve_bootmem_generic() like it already has been added in reserve_bootmem() with commit 72a7fe3967dbf86cb34e24fbf1d957fe24d2f246. It also changes all users to use BOOTMEM_DEFAULT, which doesn't effectively change the behaviour. Since the change is x86-specific, I don't think it's necessary to add a new API for migration. There are only 4 users of that function. The change is necessary for the next patch, using reserve_bootmem_generic() for crashkernel reservation. Signed-off-by: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820_64.c | 3 ++- arch/x86/kernel/efi_64.c | 3 ++- arch/x86/kernel/mpparse.c | 5 +++-- arch/x86/mm/init_64.c | 17 ++++++++++++----- include/asm-x86/proto.h | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 124480c..af1eb07 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -118,7 +118,8 @@ void __init early_res_to_bootmem(unsigned long start, unsigned long end) continue; printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, final_start, final_end - 1, r->name); - reserve_bootmem_generic(final_start, final_end - final_start); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); } } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d0060fd..d561dd5 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -100,7 +100,8 @@ void __init efi_call_phys_epilog(void) void __init efi_reserve_bootmem(void) { reserve_bootmem_generic((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); + memmap.nr_map * memmap.desc_size, + BOOTMEM_DEFAULT); } void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 404683b..4901ae3 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -729,10 +729,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE); + PAGE_SIZE, BOOTMEM_DEFAULT); #endif return 1; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 819dad9..bf7bf1d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,12 +799,13 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) { #ifdef CONFIG_NUMA int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; + int ret; if (pfn >= end_pfn) { /* @@ -812,11 +813,11 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) * firmware tables: */ if (pfn < max_pfn_mapped) - return; + return -EFAULT; printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); - return; + return -EFAULT; } /* Should check here against the e820 map to avoid double free */ @@ -824,9 +825,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) nid = phys_to_nid(phys); next_nid = phys_to_nid(phys + len - 1); if (nid == next_nid) - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + ret = reserve_bootmem(phys, len, flags); + + if (ret != 0) + return ret; + #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif @@ -835,6 +840,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); } + + return 0; } int kern_addr_valid(unsigned long addr) diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index 6c8b41b..a9f5147 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,7 +14,7 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern void reserve_bootmem_generic(unsigned long phys, unsigned len); +extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); extern void syscall32_cpu_init(void); -- cgit v1.1 From 46f68e1c6b04a04772e828ff3bcd07ed708805c2 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Sun, 8 Jun 2008 15:46:31 +0200 Subject: x86: use reserve_bootmem_generic() to reserve crashkernel memory on x86_64 This patch uses reserve_bootmem_generic() instead of reserve_bootmem() to reserve the crashkernel memory on x86_64. That's necessary for NUMA machines, see 00212fef814612245ed0261cbac8426d0c9a31a5: [PATCH] Fix kdump Crash Kernel boot memory reservation for NUMA machines This patch will fix a boot memory reservation bug that trashes memory on the ES7000 when loading the kdump crash kernel. The code in arch/x86_64/kernel/setup.c to reserve boot memory for the crash kernel uses the non-numa aware "reserve_bootmem" function instead of the NUMA aware "reserve_bootmem_generic". I checked to make sure that no other function was using "reserve_bootmem" and found none, except the ones that had NUMA ifdef'ed out. I have tested this patch only on an ES7000 with NUMA on and off (numa=off) in a single (non-NUMA) and multi-cell (NUMA) configurations. Signed-off-by: Amul Shah <amul.shah@unisys.com> Looks-good-to: Vivek Goyal <vgoyal@in.ibm.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> The switch-back to reserve_bootmem() was accidentally introduced in 5c3391f9f749023a49c64d607da4fb49263690eb when adding the BOOTMEM_EXCLUSIVE parameter. Signed-off-by: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index e8df64f..4a666cd 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -243,7 +243,7 @@ static void __init reserve_crashkernel(void) return; } - if (reserve_bootmem(crash_base, crash_size, + if (reserve_bootmem_generic(crash_base, crash_size, BOOTMEM_EXCLUSIVE) < 0) { printk(KERN_INFO "crashkernel reservation failed - " "memory is in use\n"); -- cgit v1.1 From 1812924bb1823950c1dc95c478b71b037057356e Mon Sep 17 00:00:00 2001 From: Cliff Wickman <cpw@sgi.com> Date: Mon, 2 Jun 2008 08:56:14 -0500 Subject: x86, SGI UV: TLB shootdown using broadcast assist unit TLB shootdown for SGI UV. Depends on patch (in tip/x86/irq): x86-update-macros-used-by-uv-platform.patch Jack Steiner May 29 This patch provides the ability to flush TLB's in cpu's that are not on the local node. The hardware mechanism for distributing the flush messages is the UV's "broadcast assist unit". The hook to intercept TLB shootdown requests is a 2-line change to native_flush_tlb_others() (arch/x86/kernel/tlb_64.c). This code has been tested on a hardware simulator. The real hardware is not yet available. The shootdown statistics are provided through /proc/sgi_uv/ptc_statistics. The use of /sys was considered, but would have required the use of many /sys files. The debugfs was also considered, but these statistics should be available on an ongoing basis, not just for debugging. Issues to be fixed later: - The IRQ for the messaging interrupt is currently hardcoded as 200 (see UV_BAU_MESSAGE). It should be dynamically assigned in the future. - The use of appropriate udelay()'s is untested, as they are a problem in the simulator. Signed-off-by: Cliff Wickman <cpw@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_64.S | 4 + arch/x86/kernel/tlb_64.c | 5 + arch/x86/kernel/tlb_uv.c | 736 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/uv/uv_bau.h | 331 ++++++++++++++++++++ 5 files changed, 1077 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/tlb_uv.c create mode 100644 include/asm-x86/uv/uv_bau.h diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 605995e..4078e98 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_OLPC) += olpc.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o + obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df..6fd1987 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -720,6 +720,10 @@ ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) +ENTRY(uv_bau_message_intr1) + apicinterrupt 220,uv_bau_message_interrupt +END(uv_bau_message_intr1) + ENTRY(error_interrupt) apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt END(error_interrupt) diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d7..fc13211 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -15,6 +15,8 @@ #include <asm/proto.h> #include <asm/apicdef.h> #include <asm/idle.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_bau.h> #include <mach_ipi.h> /* @@ -162,6 +164,9 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, union smp_flush_state *f; cpumask_t cpumask = *cpumaskp; + if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) + return; + /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c new file mode 100644 index 0000000..28e7c68 --- /dev/null +++ b/arch/x86/kernel/tlb_uv.c @@ -0,0 +1,736 @@ +/* + * SGI UltraViolet TLB flush routines. + * + * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. + * + * This code is released under the GNU General Public License version 2 or + * later. + */ +#include <linux/mc146818rtc.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> + +#include <asm/mach-bigsmp/mach_apic.h> +#include <asm/mmu_context.h> +#include <asm/idle.h> +#include <asm/genapic.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_mmrs.h> +#include <asm/uv/uv_bau.h> + +struct bau_control **uv_bau_table_bases; +static int uv_bau_retry_limit; +static int uv_nshift; /* position of pnode (which is nasid>>1) */ +static unsigned long uv_mmask; + +char *status_table[] = { + "IDLE", + "ACTIVE", + "DESTINATION TIMEOUT", + "SOURCE TIMEOUT" +}; + +DEFINE_PER_CPU(struct ptc_stats, ptcstats); +DEFINE_PER_CPU(struct bau_control, bau_control); + +/* + * Free a software acknowledge hardware resource by clearing its Pending + * bit. This will return a reply to the sender. + * If the message has timed out, a reply has already been sent by the + * hardware but the resource has not been released. In that case our + * clear of the Timeout bit (as well) will free the resource. No reply will + * be sent (the hardware will only do one reply per message). + */ +static void +uv_reply_to_message(int resource, + struct bau_payload_queue_entry *msg, + struct bau_msg_status *msp) +{ + int fw; + + fw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + msg->replied_to = 1; + msg->sw_ack_vector = 0; + if (msp) + msp->seen_by.bits = 0; + uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw); + return; +} + +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void +uv_bau_process_message(struct bau_payload_queue_entry *msg, + int msg_slot, int sw_ack_slot) +{ + int cpu; + unsigned long this_cpu_mask; + struct bau_msg_status *msp; + + msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; + cpu = uv_blade_processor_id(); + msg->number_of_cpus = + uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); + this_cpu_mask = (unsigned long)1 << cpu; + if (msp->seen_by.bits & this_cpu_mask) + return; + atomic_or_long(&msp->seen_by.bits, this_cpu_mask); + + if (msg->replied_to == 1) + return; + + if (msg->address == TLB_FLUSH_ALL) { + local_flush_tlb(); + __get_cpu_var(ptcstats).alltlb++; + } else { + __flush_tlb_one(msg->address); + __get_cpu_var(ptcstats).onetlb++; + } + + __get_cpu_var(ptcstats).requestee++; + + atomic_inc_short(&msg->acknowledge_count); + if (msg->number_of_cpus == msg->acknowledge_count) + uv_reply_to_message(sw_ack_slot, msg, msp); + return; +} + +/* + * Examine the payload queue on all the distribution nodes to see + * which messages have not been seen, and which cpu(s) have not seen them. + * + * Returns the number of cpu's that have not responded. + */ +static int +uv_examine_destinations(struct bau_target_nodemask *distribution) +{ + int sender; + int i; + int j; + int k; + int count = 0; + struct bau_control *bau_tablesp; + struct bau_payload_queue_entry *msg; + struct bau_msg_status *msp; + + sender = smp_processor_id(); + for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE); + i++) { + if (bau_node_isset(i, distribution)) { + bau_tablesp = uv_bau_table_bases[i]; + for (msg = bau_tablesp->va_queue_first, j = 0; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) { + if ((msg->sending_cpu == sender) && + (!msg->replied_to)) { + msp = bau_tablesp->msg_statuses + j; + printk(KERN_DEBUG + "blade %d: address:%#lx %d of %d, not cpu(s): ", + i, msg->address, + msg->acknowledge_count, + msg->number_of_cpus); + for (k = 0; k < msg->number_of_cpus; + k++) { + if (!((long)1 << k & msp-> + seen_by.bits)) { + count++; + printk("%d ", k); + } + } + printk("\n"); + } + } + } + } + return count; +} + +/** + * uv_flush_tlb_others - globally purge translation cache of a virtual + * address or all TLB's + * @cpumaskp: mask of all cpu's in which the address is to be removed + * @mm: mm_struct containing virtual address range + * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * + * This is the entry point for initiating any UV global TLB shootdown. + * + * Purges the translation caches of all specified processors of the given + * virtual address, or purges all TLB's on specified processors. + * + * The caller has derived the cpumaskp from the mm_struct and has subtracted + * the local cpu from the mask. This function is called only if there + * are bits set in the mask. (e.g. flush_tlb_page()) + * + * The cpumaskp is converted into a nodemask of the nodes containing + * the cpus. + */ +int +uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va) +{ + int i; + int blade; + int cpu; + int bit; + int right_shift; + int this_blade; + int exams = 0; + int tries = 0; + long source_timeouts = 0; + long destination_timeouts = 0; + unsigned long index; + unsigned long mmr_offset; + unsigned long descriptor_status; + struct bau_activation_descriptor *bau_desc; + ktime_t time1, time2; + + cpu = uv_blade_processor_id(); + this_blade = uv_numa_blade_id(); + bau_desc = __get_cpu_var(bau_control).descriptor_base; + bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu); + + bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + + i = 0; + for_each_cpu_mask(bit, *cpumaskp) { + blade = uv_cpu_to_blade_id(bit); + if (blade > (UV_DISTRIBUTION_SIZE - 1)) + BUG(); + if (blade == this_blade) + continue; + bau_node_set(blade, &bau_desc->distribution); + /* leave the bits for the remote cpu's in the mask until + success; on failure we fall back to the IPI method */ + i++; + } + if (i == 0) + goto none_to_flush; + __get_cpu_var(ptcstats).requestor++; + __get_cpu_var(ptcstats).ntargeted += i; + + bau_desc->payload.address = va; + bau_desc->payload.sending_cpu = smp_processor_id(); + + if (cpu < UV_CPUS_PER_ACT_STATUS) { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + } else { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + right_shift = + ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); + } + time1 = ktime_get(); + +retry: + tries++; + index = ((unsigned long) + 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu; + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + + while ((descriptor_status = (((unsigned long) + uv_read_local_mmr(mmr_offset) >> + right_shift) & UV_ACT_STATUS_MASK)) != + DESC_STATUS_IDLE) { + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + source_timeouts++; + if (source_timeouts > SOURCE_TIMEOUT_LIMIT) + source_timeouts = 0; + __get_cpu_var(ptcstats).s_retry++; + goto retry; + } + /* spin here looking for progress at the destinations */ + if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { + destination_timeouts++; + if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { + /* returns # of cpus not responding */ + if (uv_examine_destinations + (&bau_desc->distribution) == 0) { + __get_cpu_var(ptcstats).d_retry++; + goto retry; + } + exams++; + if (exams >= uv_bau_retry_limit) { + printk(KERN_DEBUG + "uv_flush_tlb_others"); + printk("giving up on cpu %d\n", + smp_processor_id()); + goto unsuccessful; + } + /* delays can hang up the simulator + udelay(1000); + */ + destination_timeouts = 0; + } + } + } + if (tries > 1) + __get_cpu_var(ptcstats).retriesok++; + /* on success, clear the remote cpu's from the mask so we don't + use the IPI method of shootdown on them */ + for_each_cpu_mask(bit, *cpumaskp) { + blade = uv_cpu_to_blade_id(bit); + if (blade == this_blade) + continue; + cpu_clear(bit, *cpumaskp); + } + +unsuccessful: + time2 = ktime_get(); + __get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64); + +none_to_flush: + if (cpus_empty(*cpumaskp)) + return 1; + + /* Cause the caller to do an IPI-style TLB shootdown on + the cpu's still in the mask */ + __get_cpu_var(ptcstats).ptc_i++; + return 0; +} + +/* + * The BAU message interrupt comes here. (registered by set_intr_gate) + * See entry_64.S + * + * We received a broadcast assist message. + * + * Interrupts may have been disabled; this interrupt could represent + * the receipt of several messages. + * + * All cores/threads on this node get this interrupt. + * The last one to see it does the s/w ack. + * (the resource will not be freed until noninterruptable cpus see this + * interrupt; hardware will timeout the s/w ack and reply ERROR) + */ +void +uv_bau_message_interrupt(struct pt_regs *regs) +{ + struct bau_payload_queue_entry *pqp; + struct bau_payload_queue_entry *msg; + struct pt_regs *old_regs = set_irq_regs(regs); + ktime_t time1, time2; + int msg_slot; + int sw_ack_slot; + int fw; + int count = 0; + unsigned long local_pnode; + + ack_APIC_irq(); + exit_idle(); + irq_enter(); + + time1 = ktime_get(); + + local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); + + pqp = __get_cpu_var(bau_control).va_queue_first; + msg = __get_cpu_var(bau_control).bau_msg_head; + while (msg->sw_ack_vector) { + count++; + fw = msg->sw_ack_vector; + msg_slot = msg - pqp; + sw_ack_slot = ffs(fw) - 1; + + uv_bau_process_message(msg, msg_slot, sw_ack_slot); + + msg++; + if (msg > __get_cpu_var(bau_control).va_queue_last) + msg = __get_cpu_var(bau_control).va_queue_first; + __get_cpu_var(bau_control).bau_msg_head = msg; + } + if (!count) + __get_cpu_var(ptcstats).nomsg++; + else if (count > 1) + __get_cpu_var(ptcstats).multmsg++; + + time2 = ktime_get(); + __get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64); + + irq_exit(); + set_irq_regs(old_regs); + return; +} + +static void +uv_enable_timeouts(void) +{ + int i; + int blade; + int last_blade; + int pnode; + int cur_cpu = 0; + unsigned long apicid; + + /* better if we had each_online_blade */ + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); + pnode = uv_blade_to_pnode(blade); + cur_cpu += uv_blade_nr_possible_cpus(i); + } + return; +} + +static void * +uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +{ + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void * +uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void +uv_ptc_seq_stop(struct seq_file *file, void *data) +{ +} + +/* + * Display the statistics thru /proc + * data points to the cpu number + */ +static int +uv_ptc_seq_show(struct seq_file *file, void *data) +{ + struct ptc_stats *stat; + int cpu; + + cpu = *(loff_t *)data; + + if (!cpu) { + seq_printf(file, + "# cpu requestor requestee one all sretry dretry ptc_i "); + seq_printf(file, + "sw_ack sflush_us dflush_us sok dnomsg dmult starget\n"); + } + if (cpu < num_possible_cpus() && cpu_online(cpu)) { + stat = &per_cpu(ptcstats, cpu); + seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", + cpu, stat->requestor, + stat->requestee, stat->onetlb, stat->alltlb, + stat->s_retry, stat->d_retry, stat->ptc_i); + seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", + uv_read_global_mmr64(uv_blade_to_pnode + (uv_cpu_to_blade_id(cpu)), + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), + stat->sflush_ns / 1000, stat->dflush_ns / 1000, + stat->retriesok, stat->nomsg, + stat->multmsg, stat->ntargeted); + } + + return 0; +} + +/* + * 0: display meaning of the statistics + * >0: retry limit + */ +static ssize_t +uv_ptc_proc_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + long newmode; + char optstr[64]; + + if (copy_from_user(optstr, user, count)) + return -EFAULT; + optstr[count - 1] = '\0'; + if (strict_strtoul(optstr, 10, &newmode) < 0) { + printk(KERN_DEBUG "%s is invalid\n", optstr); + return -EINVAL; + } + + if (newmode == 0) { + printk(KERN_DEBUG "# cpu: cpu number\n"); + printk(KERN_DEBUG + "requestor: times this cpu was the flush requestor\n"); + printk(KERN_DEBUG + "requestee: times this cpu was requested to flush its TLBs\n"); + printk(KERN_DEBUG + "one: times requested to flush a single address\n"); + printk(KERN_DEBUG + "all: times requested to flush all TLB's\n"); + printk(KERN_DEBUG + "sretry: number of retries of source-side timeouts\n"); + printk(KERN_DEBUG + "dretry: number of retries of destination-side timeouts\n"); + printk(KERN_DEBUG + "ptc_i: times UV fell through to IPI-style flushes\n"); + printk(KERN_DEBUG + "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); + printk(KERN_DEBUG + "sflush_us: microseconds spent in uv_flush_tlb_others()\n"); + printk(KERN_DEBUG + "dflush_us: microseconds spent in handling flush requests\n"); + printk(KERN_DEBUG "sok: successes on retry\n"); + printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); + printk(KERN_DEBUG + "dmult: interrupts with multiple messages\n"); + printk(KERN_DEBUG "starget: nodes targeted\n"); + } else { + uv_bau_retry_limit = newmode; + printk(KERN_DEBUG "timeout retry limit:%d\n", + uv_bau_retry_limit); + } + + return count; +} + +static const struct seq_operations uv_ptc_seq_ops = { + .start = uv_ptc_seq_start, + .next = uv_ptc_seq_next, + .stop = uv_ptc_seq_stop, + .show = uv_ptc_seq_show +}; + +static int +uv_ptc_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &uv_ptc_seq_ops); +} + +static const struct file_operations proc_uv_ptc_operations = { + .open = uv_ptc_proc_open, + .read = seq_read, + .write = uv_ptc_proc_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *proc_uv_ptc; + +static int __init +uv_ptc_init(void) +{ + static struct proc_dir_entry *sgi_proc_dir; + + sgi_proc_dir = NULL; + + if (!is_uv_system()) + return 0; + + sgi_proc_dir = proc_mkdir("sgi_uv", NULL); + if (!sgi_proc_dir) + return -EINVAL; + + proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + if (!proc_uv_ptc) { + printk(KERN_ERR "unable to create %s proc entry\n", + UV_PTC_BASENAME); + return -EINVAL; + } + proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; + return 0; +} + +static void __exit +uv_ptc_exit(void) +{ + remove_proc_entry(UV_PTC_BASENAME, NULL); +} + +module_init(uv_ptc_init); +module_exit(uv_ptc_exit); + +/* + * Initialization of BAU-related structures + */ +int __init +uv_bau_init(void) +{ + int i; + int j; + int blade; + int nblades; + int *ip; + int pnode; + int last_blade; + int cur_cpu = 0; + unsigned long pa; + unsigned long n; + unsigned long m; + unsigned long mmr_image; + unsigned long apicid; + char *cp; + struct bau_control *bau_tablesp; + struct bau_activation_descriptor *adp, *ad2; + struct bau_payload_queue_entry *pqp; + struct bau_msg_status *msp; + struct bau_control *bcp; + + if (!is_uv_system()) + return 0; + + uv_bau_retry_limit = 1; + + if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) < + MAX_CPUS_PER_NODE) { + printk(KERN_ERR + "uv_bau_init: bau_local_cpumask.bits too small\n"); + BUG(); + } + + uv_nshift = uv_hub_info->n_val; + uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1; + nblades = 0; + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + nblades++; + } + + uv_bau_table_bases = (struct bau_control **) + kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); + if (!uv_bau_table_bases) + BUG(); + + /* better if we had each_online_blade */ + last_blade = -1; + for_each_online_node(i) { + blade = uv_node_to_blade_id(i); + if (blade == last_blade) + continue; + last_blade = blade; + + bau_tablesp = + kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i); + if (!bau_tablesp) + BUG(); + + bau_tablesp->msg_statuses = + kmalloc_node(sizeof(struct bau_msg_status) * + DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i); + if (!bau_tablesp->msg_statuses) + BUG(); + for (j = 0, msp = bau_tablesp->msg_statuses; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) { + bau_cpubits_clear(&msp->seen_by, (int) + uv_blade_nr_possible_cpus(blade)); + } + + bau_tablesp->watching = + kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES, + GFP_KERNEL, i); + if (!bau_tablesp->watching) + BUG(); + for (j = 0, ip = bau_tablesp->watching; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) { + *ip = 0; + } + + uv_bau_table_bases[i] = bau_tablesp; + + pnode = uv_blade_to_pnode(blade); + + if (sizeof(struct bau_activation_descriptor) != 64) + BUG(); + + adp = (struct bau_activation_descriptor *) + kmalloc_node(16384, GFP_KERNEL, i); + if (!adp) + BUG(); + if ((unsigned long)adp & 0xfff) + BUG(); + pa = __pa((unsigned long)adp); + n = pa >> uv_nshift; + m = pa & uv_mmask; + + mmr_image = uv_read_global_mmr64(pnode, + UVH_LB_BAU_SB_DESCRIPTOR_BASE); + if (mmr_image) + uv_write_global_mmr64(pnode, (unsigned long) + UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | + m)); + for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE; + j++, ad2++) { + memset(ad2, 0, + sizeof(struct bau_activation_descriptor)); + ad2->header.sw_ack_flag = 1; + ad2->header.base_dest_nodeid = + uv_blade_to_pnode(uv_cpu_to_blade_id(0)); + ad2->header.command = UV_NET_ENDPOINT_INTD; + ad2->header.int_both = 1; + /* all others need to be set to zero: + fairness chaining multilevel count replied_to */ + } + + pqp = (struct bau_payload_queue_entry *) + kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) * + sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, i); + if (!pqp) + BUG(); + if (sizeof(struct bau_payload_queue_entry) != 32) + BUG(); + if ((unsigned long)(&((struct bau_payload_queue_entry *)0)-> + sw_ack_vector) != 15) + BUG(); + + cp = (char *)pqp + 31; + pqp = (struct bau_payload_queue_entry *) + (((unsigned long)cp >> 5) << 5); + bau_tablesp->va_queue_first = pqp; + uv_write_global_mmr64(pnode, + UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, + ((unsigned long)pnode << + UV_PAYLOADQ_PNODE_SHIFT) | + uv_physnodeaddr(pqp)); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, + uv_physnodeaddr(pqp)); + bau_tablesp->va_queue_last = + pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, + (unsigned long) + uv_physnodeaddr(bau_tablesp-> + va_queue_last)); + memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * + DESTINATION_PAYLOAD_QUEUE_SIZE); + + /* this initialization can't be in firmware because the + messaging IRQ will be determined by the OS */ + apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); + pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); + if ((pa & 0xff) != UV_BAU_MESSAGE) { + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + ((apicid << 32) | + UV_BAU_MESSAGE)); + } + + for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i)); + j++) { + bcp = (struct bau_control *)&per_cpu(bau_control, j); + bcp->bau_msg_head = bau_tablesp->va_queue_first; + bcp->va_queue_first = bau_tablesp->va_queue_first; + + bcp->va_queue_last = bau_tablesp->va_queue_last; + bcp->watching = bau_tablesp->watching; + bcp->msg_statuses = bau_tablesp->msg_statuses; + bcp->descriptor_base = adp; + } + cur_cpu += uv_blade_nr_possible_cpus(i); + } + + set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + + uv_enable_timeouts(); + + return 0; +} + +__initcall(uv_bau_init); diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h new file mode 100644 index 0000000..f125f86 --- /dev/null +++ b/include/asm-x86/uv/uv_bau.h @@ -0,0 +1,331 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV Broadcast Assist Unit definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef __ASM_X86_UV_BAU__ +#define __ASM_X86_UV_BAU__ + +#include <linux/bitmap.h> +#define BITSPERBYTE 8 + +/* Broadcast Assist Unit messaging structures */ + +/* + * Selective Broadcast activations are induced by software action + * specifying a particular 8-descriptor "set" via a 6-bit index written + * to an MMR. + * Thus there are 64 unique 512-byte sets of SB descriptors - one set for + * each 6-bit index value. These descriptor sets are mapped in sequence + * starting with set 0 located at the address specified in the + * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, + * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. + * + * We will use 31 sets, one for sending BAU messages from each of the 32 + * cpu's on the node. + * + * TLB shootdown will use the first of the 8 descriptors of each set. + * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). + */ + +#define UV_ITEMS_PER_DESCRIPTOR 8 +#define UV_CPUS_PER_ACT_STATUS 32 +#define UV_ACT_STATUS_MASK 0x3 +#define UV_ACT_STATUS_SIZE 2 +#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_DISTRIBUTION_SIZE 256 +#define UV_SW_ACK_NPENDING 8 +#define UV_BAU_MESSAGE 200 /* Messaging irq; see irq_64.h */ + /* and include/asm-x86/hw_irq_64.h */ + /* To be dynamically allocated in the future */ +#define UV_NET_ENDPOINT_INTD 0x38 +#define UV_DESC_BASE_PNODE_SHIFT 49 /* position of pnode (nasid>>1) in MMR */ +#define UV_PAYLOADQ_PNODE_SHIFT 49 + +#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) + +/* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ +#define DESC_STATUS_IDLE 0 +#define DESC_STATUS_ACTIVE 1 +#define DESC_STATUS_DESTINATION_TIMEOUT 2 +#define DESC_STATUS_SOURCE_TIMEOUT 3 + +/* source side threshholds at which message retries print a warning */ +#define SOURCE_TIMEOUT_LIMIT 20 +#define DESTINATION_TIMEOUT_LIMIT 20 + +/* number of entries in the destination side payload queue */ +#define DESTINATION_PAYLOAD_QUEUE_SIZE 17 +/* number of destination side software ack resources */ +#define DESTINATION_NUM_RESOURCES 8 +#define MAX_CPUS_PER_NODE 32 + +/* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) */ +/* If the 'multilevel' flag in the header portion of the descriptor + * has been set to 0, then endpoint multi-unicast mode is selected. + * The distribution specification (32 bytes) is interpreted as a 256-bit + * distribution vector. Adjacent bits correspond to consecutive even numbered + * nodeIDs. The result of adding the index of a given bit to the 15-bit + * 'base_dest_nodeid' field of the header corresponds to the + * destination nodeID associated with that specified bit. */ +struct bau_target_nodemask { + unsigned long bits[BITS_TO_LONGS(256)]; +}; + +/* mask of cpu's on a node */ +/* (during initialization we need to check that unsigned long has + enough bits for max. cpu's per node) */ +struct bau_local_cpumask { + unsigned long bits; +}; + +/* + * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor) + * only 12 bytes (96 bits) of the payload area are usable. + * An additional 3 bytes (bits 27:4) of the header address are carried + * to the next bytes of the destination payload queue. + * And an additional 2 bytes of the header Suppl_A field are also + * carried to the destination payload queue. + * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte) + * of the destination payload queue, which is written by the hardware + * with the s/w ack resource bit vector. + * [ effective message contents (16 bytes (128 bits) maximum), not counting + * the s/w ack bit vector ] + */ + +/* The payload is software-defined for INTD transactions */ +struct bau_msg_payload { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits */ + unsigned short sending_cpu; /* filled in by sender */ + /* 16 bits */ + unsigned short acknowledge_count;/* filled in by destination */ + /* 16 bits */ + unsigned int reserved1:32; /* not usable */ +}; + + +/* Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) */ +/* see table 4.2.3.0.1 in broacast_assist spec. */ +struct bau_msg_header { + int dest_subnodeid:6; /* must be zero */ + /* bits 5:0 */ + int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ + /* bits 20:6 */ + int command:8; /* message type */ + /* bits 28:21 */ + /* 0x38: SN3net EndPoint Message */ + int rsvd_1:3; /* must be zero */ + /* bits 31:29 */ + /* int will align on 32 bits */ + int rsvd_2:9; /* must be zero */ + /* bits 40:32 */ + /* Suppl_A is 56-41 */ + int payload_2a:8; /* becomes byte 16 of msg */ + /* bits 48:41 */ /* not currently using */ + int payload_2b:8; /* becomes byte 17 of msg */ + /* bits 56:49 */ /* not currently using */ + /* Address field (96:57) is never used as an + address (these are address bits 42:3) */ + int rsvd_3:1; /* must be zero */ + /* bit 57 */ + /* address bits 27:4 are payload */ + /* these 24 bits become bytes 12-14 of msg */ + int replied_to:1; /* sent as 0 by the source to byte 12 */ + /* bit 58 */ + + int payload_1a:5; /* not currently used */ + /* bits 63:59 */ + int payload_1b:8; /* not currently used */ + /* bits 71:64 */ + int payload_1c:8; /* not currently used */ + /* bits 79:72 */ + int payload_1d:2; /* not currently used */ + /* bits 81:80 */ + + int rsvd_4:7; /* must be zero */ + /* bits 88:82 */ + int sw_ack_flag:1; /* software acknowledge flag */ + /* bit 89 */ + /* INTD trasactions at destination are to + wait for software acknowledge */ + int rsvd_5:6; /* must be zero */ + /* bits 95:90 */ + int rsvd_6:5; /* must be zero */ + /* bits 100:96 */ + int int_both:1; /* if 1, interrupt both sockets on the blade */ + /* bit 101*/ + int fairness:3; /* usually zero */ + /* bits 104:102 */ + int multilevel:1; /* multi-level multicast format */ + /* bit 105 */ + /* 0 for TLB: endpoint multi-unicast messages */ + int chaining:1; /* next descriptor is part of this activation*/ + /* bit 106 */ + int rsvd_7:21; /* must be zero */ + /* bits 127:107 */ +}; + +/* The format of the message to send, plus all accompanying control */ +/* Should be 64 bytes */ +struct bau_activation_descriptor { + struct bau_target_nodemask distribution; + /* message template, consisting of header and payload: */ + struct bau_msg_header header; + struct bau_msg_payload payload; +}; +/* + * -payload-- ---------header------ + * bytes 0-11 bits 41-56 bits 58-81 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ + +/* + * The payload queue on the destination side is an array of these. + * With BAU_MISC_CONTROL set for software acknowledge mode, the messages + * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 + * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) + * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from + * sw_ack_vector and payload_2) + * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software + * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload + * operation." + */ +struct bau_payload_queue_entry { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ + /* 64 bits, bytes 0-7 */ + + unsigned short sending_cpu; /* cpu that sent the message */ + /* 16 bits, bytes 8-9 */ + + unsigned short acknowledge_count; /* filled in by destination */ + /* 16 bits, bytes 10-11 */ + + unsigned short replied_to:1; /* sent as 0 by the source */ + /* 1 bit */ + unsigned short unused1:7; /* not currently using */ + /* 7 bits: byte 12) */ + + unsigned char unused2[2]; /* not currently using */ + /* bytes 13-14 */ + + unsigned char sw_ack_vector; /* filled in by the hardware */ + /* byte 15 (bits 127:120) */ + + unsigned char unused4[3]; /* not currently using bytes 17-19 */ + /* bytes 17-19 */ + + int number_of_cpus; /* filled in at destination */ + /* 32 bits, bytes 20-23 (aligned) */ + + unsigned char unused5[8]; /* not using */ + /* bytes 24-31 */ +}; + +/* one for every slot in the destination payload queue */ +struct bau_msg_status { + struct bau_local_cpumask seen_by; /* map of cpu's */ +}; + +/* one for every slot in the destination software ack resources */ +struct bau_sw_ack_status { + struct bau_payload_queue_entry *msg; /* associated message */ + int watcher; /* cpu monitoring, or -1 */ +}; + +/* one on every node and per-cpu; to locate the software tables */ +struct bau_control { + struct bau_activation_descriptor *descriptor_base; + struct bau_payload_queue_entry *bau_msg_head; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; + struct bau_msg_status *msg_statuses; + int *watching; /* pointer to array */ +}; + +/* + * This structure is allocated per_cpu for UV TLB shootdown statistics. + */ +struct ptc_stats { + unsigned long ptc_i; /* number of IPI-style flushes */ + unsigned long requestor; /* number of nodes this cpu sent to */ + unsigned long requestee; /* times cpu was remotely requested */ + unsigned long alltlb; /* times all tlb's on this cpu were flushed */ + unsigned long onetlb; /* times just one tlb on this cpu was flushed */ + unsigned long s_retry; /* retries on source side timeouts */ + unsigned long d_retry; /* retries on destination side timeouts */ + unsigned long sflush_ns;/* nanoseconds spent in uv_flush_tlb_others */ + unsigned long dflush_ns;/* nanoseconds spent destination side */ + unsigned long retriesok; /* successes on retries */ + unsigned long nomsg; /* interrupts with no message */ + unsigned long multmsg; /* interrupts with multiple messages */ + unsigned long ntargeted;/* nodes targeted */ +}; + +static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) +{ + return constant_test_bit(node, &dstp->bits[0]); +} +static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) +{ + __set_bit(node, &dstp->bits[0]); +} +static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits[0], nbits); +} + +static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) +{ + bitmap_zero(&dstp->bits, nbits); +} + +/* + * atomic increment of a short integer + * (rather than using the __sync_add_and_fetch() intrinsic) + * + * returns the new value of the variable + */ +static inline short int atomic_inc_short(short int *v) +{ + asm volatile("movw $1, %%cx\n" + "lock ; xaddw %%cx, %0\n" + : "+m" (*v) /* outputs */ + : : "%cx", "memory"); /* inputs : clobbereds */ + return *v; +} + +/* + * atomic OR of two long integers + * (rather than using the __sync_or_and_fetch() intrinsic) + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm volatile("movq %0, %%rax; lea %1, %%rdx\n" + "lock ; orq %%rax, %%rdx\n" + : "+m" (*v1) /* outputs */ + : "m" (v1), "m" (v2) /* inputs */ + : "memory"); /* clobbereds */ +} + +#define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) + +int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); +void uv_bau_message_intr1(void); +void uv_bau_timeout_intr1(void); + +#endif /* __ASM_X86_UV_BAU__ */ -- cgit v1.1 From 73e991f45fe7644711c0c9dd357a1a2c6e222707 Mon Sep 17 00:00:00 2001 From: Cliff Wickman <cpw@sgi.com> Date: Wed, 4 Jun 2008 15:33:17 -0500 Subject: x86 atomic operations: atomic_or_long() atomic_inc_short() Provide atomic operations for increment of a 16-bit integer and logical OR into a 64-bit integer. Signed-off-by: Cliff Wickman <cpw@sgi.com> Reviewed-by: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/atomic_64.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/asm-x86/atomic_64.h b/include/asm-x86/atomic_64.h index 3e0cd7d..55c0dd9 100644 --- a/include/asm-x86/atomic_64.h +++ b/include/asm-x86/atomic_64.h @@ -431,6 +431,32 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u) return c != (u); } +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} + #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) /* These are x86-specific, used by some header files */ -- cgit v1.1 From b194b120507276b4f09e2e14f941884e777fc7c8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman <cpw@sgi.com> Date: Thu, 12 Jun 2008 08:23:48 -0500 Subject: SGI UV: TLB shootdown using broadcast assist unit, cleanups TLB shootdown for SGI UV. v1: 6/2 original v2: 6/3 corrections/improvements per Ingo's review v3: 6/4 split atomic operations off to a separate patch (Jeremy's review) v4: 6/12 include <mach_apic.h> rather than <asm/mach-bigsmp/mach_apic.h> (fixes a !SMP build problem that Ingo found) fix the index on uv_table_bases[blade] Signed-off-by: Cliff Wickman <cpw@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_64.c | 2 +- arch/x86/kernel/tlb_uv.c | 704 ++++++++++++++++++++++++-------------------- include/asm-x86/uv/uv_bau.h | 147 ++++----- 3 files changed, 454 insertions(+), 399 deletions(-) diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index fc13211..5039d0f 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -165,7 +165,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, cpumask_t cpumask = *cpumaskp; if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) - return; + return; /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 28e7c68..f7bc6a6 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -10,18 +10,20 @@ #include <linux/proc_fs.h> #include <linux/kernel.h> -#include <asm/mach-bigsmp/mach_apic.h> #include <asm/mmu_context.h> #include <asm/idle.h> #include <asm/genapic.h> #include <asm/uv/uv_hub.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_bau.h> +#include <asm/tsc.h> -struct bau_control **uv_bau_table_bases; -static int uv_bau_retry_limit; -static int uv_nshift; /* position of pnode (which is nasid>>1) */ -static unsigned long uv_mmask; +#include <mach_apic.h> + +static struct bau_control **uv_bau_table_bases __read_mostly; +static int uv_bau_retry_limit __read_mostly; +static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */ +static unsigned long uv_mmask __read_mostly; char *status_table[] = { "IDLE", @@ -41,19 +43,18 @@ DEFINE_PER_CPU(struct bau_control, bau_control); * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void -uv_reply_to_message(int resource, +static void uv_reply_to_message(int resource, struct bau_payload_queue_entry *msg, struct bau_msg_status *msp) { - int fw; + unsigned long dw; - fw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); msg->replied_to = 1; msg->sw_ack_vector = 0; if (msp) msp->seen_by.bits = 0; - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, fw); + uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); return; } @@ -61,8 +62,7 @@ uv_reply_to_message(int resource, * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void -uv_bau_process_message(struct bau_payload_queue_entry *msg, +static void uv_bau_process_message(struct bau_payload_queue_entry *msg, int msg_slot, int sw_ack_slot) { int cpu; @@ -103,8 +103,7 @@ uv_bau_process_message(struct bau_payload_queue_entry *msg, * * Returns the number of cpu's that have not responded. */ -static int -uv_examine_destinations(struct bau_target_nodemask *distribution) +static int uv_examine_destinations(struct bau_target_nodemask *distribution) { int sender; int i; @@ -118,34 +117,161 @@ uv_examine_destinations(struct bau_target_nodemask *distribution) sender = smp_processor_id(); for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE); i++) { - if (bau_node_isset(i, distribution)) { - bau_tablesp = uv_bau_table_bases[i]; - for (msg = bau_tablesp->va_queue_first, j = 0; - j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) { - if ((msg->sending_cpu == sender) && - (!msg->replied_to)) { - msp = bau_tablesp->msg_statuses + j; - printk(KERN_DEBUG + if (!bau_node_isset(i, distribution)) + continue; + bau_tablesp = uv_bau_table_bases[i]; + for (msg = bau_tablesp->va_queue_first, j = 0; + j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) { + if ((msg->sending_cpu == sender) && + (!msg->replied_to)) { + msp = bau_tablesp->msg_statuses + j; + printk(KERN_DEBUG "blade %d: address:%#lx %d of %d, not cpu(s): ", - i, msg->address, - msg->acknowledge_count, - msg->number_of_cpus); - for (k = 0; k < msg->number_of_cpus; - k++) { - if (!((long)1 << k & msp-> - seen_by.bits)) { - count++; - printk("%d ", k); - } + i, msg->address, + msg->acknowledge_count, + msg->number_of_cpus); + for (k = 0; k < msg->number_of_cpus; + k++) { + if (!((long)1 << k & msp-> + seen_by.bits)) { + count++; + printk("%d ", k); } - printk("\n"); } + printk("\n"); } } } return count; } +/* + * wait for completion of a broadcast message + * + * return COMPLETE, RETRY or GIVEUP + */ +static int uv_wait_completion(struct bau_activation_descriptor *bau_desc, + unsigned long mmr_offset, int right_shift) +{ + int exams = 0; + long destination_timeouts = 0; + long source_timeouts = 0; + unsigned long descriptor_status; + + while ((descriptor_status = (((unsigned long) + uv_read_local_mmr(mmr_offset) >> + right_shift) & UV_ACT_STATUS_MASK)) != + DESC_STATUS_IDLE) { + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + source_timeouts++; + if (source_timeouts > SOURCE_TIMEOUT_LIMIT) + source_timeouts = 0; + __get_cpu_var(ptcstats).s_retry++; + return FLUSH_RETRY; + } + /* + * spin here looking for progress at the destinations + */ + if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { + destination_timeouts++; + if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { + /* + * returns number of cpus not responding + */ + if (uv_examine_destinations + (&bau_desc->distribution) == 0) { + __get_cpu_var(ptcstats).d_retry++; + return FLUSH_RETRY; + } + exams++; + if (exams >= uv_bau_retry_limit) { + printk(KERN_DEBUG + "uv_flush_tlb_others"); + printk("giving up on cpu %d\n", + smp_processor_id()); + return FLUSH_GIVEUP; + } + /* + * delays can hang the simulator + udelay(1000); + */ + destination_timeouts = 0; + } + } + } + return FLUSH_COMPLETE; +} + +/** + * uv_flush_send_and_wait + * + * Send a broadcast and wait for a broadcast message to complete. + * + * The cpumaskp mask contains the cpus the broadcast was sent to. + * + * Returns 1 if all remote flushing was done. The mask is zeroed. + * Returns 0 if some remote flushing remains to be done. The mask is left + * unchanged. + */ +int uv_flush_send_and_wait(int cpu, int this_blade, + struct bau_activation_descriptor *bau_desc, cpumask_t *cpumaskp) +{ + int completion_status = 0; + int right_shift; + int bit; + int blade; + int tries = 0; + unsigned long index; + unsigned long mmr_offset; + cycles_t time1; + cycles_t time2; + + if (cpu < UV_CPUS_PER_ACT_STATUS) { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + } else { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + right_shift = + ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); + } + time1 = get_cycles(); + do { + tries++; + index = ((unsigned long) + 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu; + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + completion_status = uv_wait_completion(bau_desc, mmr_offset, + right_shift); + } while (completion_status == FLUSH_RETRY); + time2 = get_cycles(); + __get_cpu_var(ptcstats).sflush += (time2 - time1); + if (tries > 1) + __get_cpu_var(ptcstats).retriesok++; + + if (completion_status == FLUSH_GIVEUP) { + /* + * Cause the caller to do an IPI-style TLB shootdown on + * the cpu's, all of which are still in the mask. + */ + __get_cpu_var(ptcstats).ptc_i++; + return 0; + } + + /* + * Success, so clear the remote cpu's from the mask so we don't + * use the IPI method of shootdown on them. + */ + for_each_cpu_mask(bit, *cpumaskp) { + blade = uv_cpu_to_blade_id(bit); + if (blade == this_blade) + continue; + cpu_clear(bit, *cpumaskp); + } + if (!cpus_empty(*cpumaskp)) + return 0; + return 1; +} + /** * uv_flush_tlb_others - globally purge translation cache of a virtual * address or all TLB's @@ -164,30 +290,25 @@ uv_examine_destinations(struct bau_target_nodemask *distribution) * * The cpumaskp is converted into a nodemask of the nodes containing * the cpus. + * + * Returns 1 if all remote flushing was done. + * Returns 0 if some remote flushing remains to be done. */ -int -uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va) +int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { int i; + int bit; int blade; int cpu; - int bit; - int right_shift; int this_blade; - int exams = 0; - int tries = 0; - long source_timeouts = 0; - long destination_timeouts = 0; - unsigned long index; - unsigned long mmr_offset; - unsigned long descriptor_status; + int locals = 0; struct bau_activation_descriptor *bau_desc; - ktime_t time1, time2; cpu = uv_blade_processor_id(); this_blade = uv_numa_blade_id(); bau_desc = __get_cpu_var(bau_control).descriptor_base; - bau_desc += (UV_ITEMS_PER_DESCRIPTOR * cpu); + bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); @@ -196,96 +317,29 @@ uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, unsigned long va) blade = uv_cpu_to_blade_id(bit); if (blade > (UV_DISTRIBUTION_SIZE - 1)) BUG(); - if (blade == this_blade) + if (blade == this_blade) { + locals++; continue; + } bau_node_set(blade, &bau_desc->distribution); - /* leave the bits for the remote cpu's in the mask until - success; on failure we fall back to the IPI method */ i++; } - if (i == 0) - goto none_to_flush; + if (i == 0) { + /* + * no off_node flushing; return status for local node + */ + if (locals) + return 0; + else + return 1; + } __get_cpu_var(ptcstats).requestor++; __get_cpu_var(ptcstats).ntargeted += i; bau_desc->payload.address = va; bau_desc->payload.sending_cpu = smp_processor_id(); - if (cpu < UV_CPUS_PER_ACT_STATUS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); - } - time1 = ktime_get(); - -retry: - tries++; - index = ((unsigned long) - 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu; - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - source_timeouts++; - if (source_timeouts > SOURCE_TIMEOUT_LIMIT) - source_timeouts = 0; - __get_cpu_var(ptcstats).s_retry++; - goto retry; - } - /* spin here looking for progress at the destinations */ - if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { - destination_timeouts++; - if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { - /* returns # of cpus not responding */ - if (uv_examine_destinations - (&bau_desc->distribution) == 0) { - __get_cpu_var(ptcstats).d_retry++; - goto retry; - } - exams++; - if (exams >= uv_bau_retry_limit) { - printk(KERN_DEBUG - "uv_flush_tlb_others"); - printk("giving up on cpu %d\n", - smp_processor_id()); - goto unsuccessful; - } - /* delays can hang up the simulator - udelay(1000); - */ - destination_timeouts = 0; - } - } - } - if (tries > 1) - __get_cpu_var(ptcstats).retriesok++; - /* on success, clear the remote cpu's from the mask so we don't - use the IPI method of shootdown on them */ - for_each_cpu_mask(bit, *cpumaskp) { - blade = uv_cpu_to_blade_id(bit); - if (blade == this_blade) - continue; - cpu_clear(bit, *cpumaskp); - } - -unsuccessful: - time2 = ktime_get(); - __get_cpu_var(ptcstats).sflush_ns += (time2.tv64 - time1.tv64); - -none_to_flush: - if (cpus_empty(*cpumaskp)) - return 1; - - /* Cause the caller to do an IPI-style TLB shootdown on - the cpu's still in the mask */ - __get_cpu_var(ptcstats).ptc_i++; - return 0; + return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); } /* @@ -302,13 +356,12 @@ none_to_flush: * (the resource will not be freed until noninterruptable cpus see this * interrupt; hardware will timeout the s/w ack and reply ERROR) */ -void -uv_bau_message_interrupt(struct pt_regs *regs) +void uv_bau_message_interrupt(struct pt_regs *regs) { struct bau_payload_queue_entry *pqp; struct bau_payload_queue_entry *msg; struct pt_regs *old_regs = set_irq_regs(regs); - ktime_t time1, time2; + cycles_t time1, time2; int msg_slot; int sw_ack_slot; int fw; @@ -319,7 +372,7 @@ uv_bau_message_interrupt(struct pt_regs *regs) exit_idle(); irq_enter(); - time1 = ktime_get(); + time1 = get_cycles(); local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); @@ -343,16 +396,15 @@ uv_bau_message_interrupt(struct pt_regs *regs) else if (count > 1) __get_cpu_var(ptcstats).multmsg++; - time2 = ktime_get(); - __get_cpu_var(ptcstats).dflush_ns += (time2.tv64 - time1.tv64); + time2 = get_cycles(); + __get_cpu_var(ptcstats).dflush += (time2 - time1); irq_exit(); set_irq_regs(old_regs); return; } -static void -uv_enable_timeouts(void) +static void uv_enable_timeouts(void) { int i; int blade; @@ -361,7 +413,6 @@ uv_enable_timeouts(void) int cur_cpu = 0; unsigned long apicid; - /* better if we had each_online_blade */ last_blade = -1; for_each_online_node(i) { blade = uv_node_to_blade_id(i); @@ -375,16 +426,14 @@ uv_enable_timeouts(void) return; } -static void * -uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) { if (*offset < num_possible_cpus()) return offset; return NULL; } -static void * -uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; if (*offset < num_possible_cpus()) @@ -392,8 +441,7 @@ uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) return NULL; } -static void -uv_ptc_seq_stop(struct seq_file *file, void *data) +static void uv_ptc_seq_stop(struct seq_file *file, void *data) { } @@ -401,8 +449,7 @@ uv_ptc_seq_stop(struct seq_file *file, void *data) * Display the statistics thru /proc * data points to the cpu number */ -static int -uv_ptc_seq_show(struct seq_file *file, void *data) +static int uv_ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; int cpu; @@ -413,7 +460,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "# cpu requestor requestee one all sretry dretry ptc_i "); seq_printf(file, - "sw_ack sflush_us dflush_us sok dnomsg dmult starget\n"); + "sw_ack sflush dflush sok dnomsg dmult starget\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -425,7 +472,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data) uv_read_global_mmr64(uv_blade_to_pnode (uv_cpu_to_blade_id(cpu)), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->sflush_ns / 1000, stat->dflush_ns / 1000, + stat->sflush, stat->dflush, stat->retriesok, stat->nomsg, stat->multmsg, stat->ntargeted); } @@ -437,8 +484,7 @@ uv_ptc_seq_show(struct seq_file *file, void *data) * 0: display meaning of the statistics * >0: retry limit */ -static ssize_t -uv_ptc_proc_write(struct file *file, const char __user *user, +static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) { long newmode; @@ -471,9 +517,9 @@ uv_ptc_proc_write(struct file *file, const char __user *user, printk(KERN_DEBUG "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); printk(KERN_DEBUG - "sflush_us: microseconds spent in uv_flush_tlb_others()\n"); + "sflush_us: cycles spent in uv_flush_tlb_others()\n"); printk(KERN_DEBUG - "dflush_us: microseconds spent in handling flush requests\n"); + "dflush_us: cycles spent in handling flush requests\n"); printk(KERN_DEBUG "sok: successes on retry\n"); printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); printk(KERN_DEBUG @@ -489,40 +535,33 @@ uv_ptc_proc_write(struct file *file, const char __user *user, } static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show + .start = uv_ptc_seq_start, + .next = uv_ptc_seq_next, + .stop = uv_ptc_seq_stop, + .show = uv_ptc_seq_show }; -static int -uv_ptc_proc_open(struct inode *inode, struct file *file) +static int uv_ptc_proc_open(struct inode *inode, struct file *file) { return seq_open(file, &uv_ptc_seq_ops); } static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, - .read = seq_read, - .write = uv_ptc_proc_write, - .llseek = seq_lseek, - .release = seq_release, + .open = uv_ptc_proc_open, + .read = seq_read, + .write = uv_ptc_proc_write, + .llseek = seq_lseek, + .release = seq_release, }; -static struct proc_dir_entry *proc_uv_ptc; - -static int __init -uv_ptc_init(void) +static int __init uv_ptc_init(void) { - static struct proc_dir_entry *sgi_proc_dir; - - sgi_proc_dir = NULL; + struct proc_dir_entry *proc_uv_ptc; if (!is_uv_system()) return 0; - sgi_proc_dir = proc_mkdir("sgi_uv", NULL); - if (!sgi_proc_dir) + if (!proc_mkdir("sgi_uv", NULL)) return -EINVAL; proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); @@ -535,202 +574,213 @@ uv_ptc_init(void) return 0; } -static void __exit -uv_ptc_exit(void) +/* + * begin the initialization of the per-blade control structures + */ +static struct bau_control * __init uv_table_bases_init(int blade, int node) { - remove_proc_entry(UV_PTC_BASENAME, NULL); + int i; + int *ip; + struct bau_msg_status *msp; + struct bau_control *bau_tablesp; + + bau_tablesp = + kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); + if (!bau_tablesp) + BUG(); + bau_tablesp->msg_statuses = + kmalloc_node(sizeof(struct bau_msg_status) * + DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, node); + if (!bau_tablesp->msg_statuses) + BUG(); + for (i = 0, msp = bau_tablesp->msg_statuses; + i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, msp++) { + bau_cpubits_clear(&msp->seen_by, (int) + uv_blade_nr_possible_cpus(blade)); + } + bau_tablesp->watching = + kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES, + GFP_KERNEL, node); + if (!bau_tablesp->watching) + BUG(); + for (i = 0, ip = bau_tablesp->watching; + i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, ip++) { + *ip = 0; + } + uv_bau_table_bases[blade] = bau_tablesp; + return bau_tablesp; } -module_init(uv_ptc_init); -module_exit(uv_ptc_exit); +/* + * finish the initialization of the per-blade control structures + */ +static void __init uv_table_bases_finish(int blade, int node, int cur_cpu, + struct bau_control *bau_tablesp, + struct bau_activation_descriptor *adp) +{ + int i; + struct bau_control *bcp; + + for (i = cur_cpu; i < (cur_cpu + uv_blade_nr_possible_cpus(blade)); + i++) { + bcp = (struct bau_control *)&per_cpu(bau_control, i); + bcp->bau_msg_head = bau_tablesp->va_queue_first; + bcp->va_queue_first = bau_tablesp->va_queue_first; + bcp->va_queue_last = bau_tablesp->va_queue_last; + bcp->watching = bau_tablesp->watching; + bcp->msg_statuses = bau_tablesp->msg_statuses; + bcp->descriptor_base = adp; + } +} /* - * Initialization of BAU-related structures + * initialize the sending side's sending buffers */ -int __init -uv_bau_init(void) +static struct bau_activation_descriptor * __init +uv_activation_descriptor_init(int node, int pnode) { int i; - int j; - int blade; - int nblades; - int *ip; - int pnode; - int last_blade; - int cur_cpu = 0; unsigned long pa; - unsigned long n; unsigned long m; + unsigned long n; unsigned long mmr_image; - unsigned long apicid; + struct bau_activation_descriptor *adp; + struct bau_activation_descriptor *ad2; + + adp = (struct bau_activation_descriptor *) + kmalloc_node(16384, GFP_KERNEL, node); + if (!adp) + BUG(); + pa = __pa((unsigned long)adp); + n = pa >> uv_nshift; + m = pa & uv_mmask; + mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); + if (mmr_image) + uv_write_global_mmr64(pnode, (unsigned long) + UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); + for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + memset(ad2, 0, sizeof(struct bau_activation_descriptor)); + ad2->header.sw_ack_flag = 1; + ad2->header.base_dest_nodeid = + uv_blade_to_pnode(uv_cpu_to_blade_id(0)); + ad2->header.command = UV_NET_ENDPOINT_INTD; + ad2->header.int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } + return adp; +} + +/* + * initialize the destination side's receiving buffers + */ +static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node, + int pnode, struct bau_control *bau_tablesp) +{ char *cp; - struct bau_control *bau_tablesp; - struct bau_activation_descriptor *adp, *ad2; struct bau_payload_queue_entry *pqp; - struct bau_msg_status *msp; - struct bau_control *bcp; - if (!is_uv_system()) - return 0; + pqp = (struct bau_payload_queue_entry *) + kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) * + sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, node); + if (!pqp) + BUG(); + cp = (char *)pqp + 31; + pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); + bau_tablesp->va_queue_first = pqp; + uv_write_global_mmr64(pnode, + UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, + ((unsigned long)pnode << + UV_PAYLOADQ_PNODE_SHIFT) | + uv_physnodeaddr(pqp)); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, + uv_physnodeaddr(pqp)); + bau_tablesp->va_queue_last = + pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, + (unsigned long) + uv_physnodeaddr(bau_tablesp->va_queue_last)); + memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * + DESTINATION_PAYLOAD_QUEUE_SIZE); + return pqp; +} - uv_bau_retry_limit = 1; +/* + * Initialization of each UV blade's structures + */ +static int __init uv_init_blade(int blade, int node, int cur_cpu) +{ + int pnode; + unsigned long pa; + unsigned long apicid; + struct bau_activation_descriptor *adp; + struct bau_payload_queue_entry *pqp; + struct bau_control *bau_tablesp; - if ((sizeof(struct bau_local_cpumask) * BITSPERBYTE) < - MAX_CPUS_PER_NODE) { - printk(KERN_ERR - "uv_bau_init: bau_local_cpumask.bits too small\n"); - BUG(); + bau_tablesp = uv_table_bases_init(blade, node); + pnode = uv_blade_to_pnode(blade); + adp = uv_activation_descriptor_init(node, pnode); + pqp = uv_payload_queue_init(node, pnode, bau_tablesp); + uv_table_bases_finish(blade, node, cur_cpu, bau_tablesp, adp); + /* + * the below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS + */ + apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); + pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); + if ((pa & 0xff) != UV_BAU_MESSAGE) { + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + ((apicid << 32) | UV_BAU_MESSAGE)); } + return 0; +} + +/* + * Initialization of BAU-related structures + */ +static int __init uv_bau_init(void) +{ + int blade; + int node; + int nblades; + int last_blade; + int cur_cpu = 0; + + if (!is_uv_system()) + return 0; + uv_bau_retry_limit = 1; uv_nshift = uv_hub_info->n_val; uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1; nblades = 0; last_blade = -1; - for_each_online_node(i) { - blade = uv_node_to_blade_id(i); + for_each_online_node(node) { + blade = uv_node_to_blade_id(node); if (blade == last_blade) continue; last_blade = blade; nblades++; } - uv_bau_table_bases = (struct bau_control **) kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); if (!uv_bau_table_bases) BUG(); - - /* better if we had each_online_blade */ last_blade = -1; - for_each_online_node(i) { - blade = uv_node_to_blade_id(i); + for_each_online_node(node) { + blade = uv_node_to_blade_id(node); if (blade == last_blade) continue; last_blade = blade; - - bau_tablesp = - kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, i); - if (!bau_tablesp) - BUG(); - - bau_tablesp->msg_statuses = - kmalloc_node(sizeof(struct bau_msg_status) * - DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, i); - if (!bau_tablesp->msg_statuses) - BUG(); - for (j = 0, msp = bau_tablesp->msg_statuses; - j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, msp++) { - bau_cpubits_clear(&msp->seen_by, (int) - uv_blade_nr_possible_cpus(blade)); - } - - bau_tablesp->watching = - kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES, - GFP_KERNEL, i); - if (!bau_tablesp->watching) - BUG(); - for (j = 0, ip = bau_tablesp->watching; - j < DESTINATION_PAYLOAD_QUEUE_SIZE; j++, ip++) { - *ip = 0; - } - - uv_bau_table_bases[i] = bau_tablesp; - - pnode = uv_blade_to_pnode(blade); - - if (sizeof(struct bau_activation_descriptor) != 64) - BUG(); - - adp = (struct bau_activation_descriptor *) - kmalloc_node(16384, GFP_KERNEL, i); - if (!adp) - BUG(); - if ((unsigned long)adp & 0xfff) - BUG(); - pa = __pa((unsigned long)adp); - n = pa >> uv_nshift; - m = pa & uv_mmask; - - mmr_image = uv_read_global_mmr64(pnode, - UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) - uv_write_global_mmr64(pnode, (unsigned long) - UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | - m)); - for (j = 0, ad2 = adp; j < UV_ACTIVATION_DESCRIPTOR_SIZE; - j++, ad2++) { - memset(ad2, 0, - sizeof(struct bau_activation_descriptor)); - ad2->header.sw_ack_flag = 1; - ad2->header.base_dest_nodeid = - uv_blade_to_pnode(uv_cpu_to_blade_id(0)); - ad2->header.command = UV_NET_ENDPOINT_INTD; - ad2->header.int_both = 1; - /* all others need to be set to zero: - fairness chaining multilevel count replied_to */ - } - - pqp = (struct bau_payload_queue_entry *) - kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) * - sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, i); - if (!pqp) - BUG(); - if (sizeof(struct bau_payload_queue_entry) != 32) - BUG(); - if ((unsigned long)(&((struct bau_payload_queue_entry *)0)-> - sw_ack_vector) != 15) - BUG(); - - cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *) - (((unsigned long)cp >> 5) << 5); - bau_tablesp->va_queue_first = pqp; - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pnode << - UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - bau_tablesp->va_queue_last = - pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(bau_tablesp-> - va_queue_last)); - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * - DESTINATION_PAYLOAD_QUEUE_SIZE); - - /* this initialization can't be in firmware because the - messaging IRQ will be determined by the OS */ - apicid = per_cpu(x86_cpu_to_apicid, cur_cpu); - pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | - UV_BAU_MESSAGE)); - } - - for (j = cur_cpu; j < (cur_cpu + uv_blade_nr_possible_cpus(i)); - j++) { - bcp = (struct bau_control *)&per_cpu(bau_control, j); - bcp->bau_msg_head = bau_tablesp->va_queue_first; - bcp->va_queue_first = bau_tablesp->va_queue_first; - - bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->watching = bau_tablesp->watching; - bcp->msg_statuses = bau_tablesp->msg_statuses; - bcp->descriptor_base = adp; - } - cur_cpu += uv_blade_nr_possible_cpus(i); + uv_init_blade(blade, node, cur_cpu); + cur_cpu += uv_blade_nr_possible_cpus(blade); } - set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); - uv_enable_timeouts(); - return 0; } - __initcall(uv_bau_init); +__initcall(uv_ptc_init); diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h index f125f86..e52fec8 100644 --- a/include/asm-x86/uv/uv_bau.h +++ b/include/asm-x86/uv/uv_bau.h @@ -14,9 +14,9 @@ #include <linux/bitmap.h> #define BITSPERBYTE 8 -/* Broadcast Assist Unit messaging structures */ - /* + * Broadcast Assist Unit messaging structures + * * Selective Broadcast activations are induced by software action * specifying a particular 8-descriptor "set" via a 6-bit index written * to an MMR. @@ -33,54 +33,73 @@ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ -#define UV_ITEMS_PER_DESCRIPTOR 8 -#define UV_CPUS_PER_ACT_STATUS 32 -#define UV_ACT_STATUS_MASK 0x3 -#define UV_ACT_STATUS_SIZE 2 -#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 -#define UV_DISTRIBUTION_SIZE 256 -#define UV_SW_ACK_NPENDING 8 -#define UV_BAU_MESSAGE 200 /* Messaging irq; see irq_64.h */ - /* and include/asm-x86/hw_irq_64.h */ - /* To be dynamically allocated in the future */ -#define UV_NET_ENDPOINT_INTD 0x38 -#define UV_DESC_BASE_PNODE_SHIFT 49 /* position of pnode (nasid>>1) in MMR */ -#define UV_PAYLOADQ_PNODE_SHIFT 49 - -#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" -#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) - -/* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ +#define UV_ITEMS_PER_DESCRIPTOR 8 +#define UV_CPUS_PER_ACT_STATUS 32 +#define UV_ACT_STATUS_MASK 0x3 +#define UV_ACT_STATUS_SIZE 2 +#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_DISTRIBUTION_SIZE 256 +#define UV_SW_ACK_NPENDING 8 +#define UV_BAU_MESSAGE 200 +/* + * Messaging irq; see irq_64.h and include/asm-x86/hw_irq_64.h + * To be dynamically allocated in the future + */ +#define UV_NET_ENDPOINT_INTD 0x38 +#define UV_DESC_BASE_PNODE_SHIFT 49 +#define UV_PAYLOADQ_PNODE_SHIFT 49 +#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) + +/* + * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 + */ #define DESC_STATUS_IDLE 0 #define DESC_STATUS_ACTIVE 1 #define DESC_STATUS_DESTINATION_TIMEOUT 2 #define DESC_STATUS_SOURCE_TIMEOUT 3 -/* source side threshholds at which message retries print a warning */ +/* + * source side threshholds at which message retries print a warning + */ #define SOURCE_TIMEOUT_LIMIT 20 #define DESTINATION_TIMEOUT_LIMIT 20 -/* number of entries in the destination side payload queue */ +/* + * number of entries in the destination side payload queue + */ #define DESTINATION_PAYLOAD_QUEUE_SIZE 17 -/* number of destination side software ack resources */ +/* + * number of destination side software ack resources + */ #define DESTINATION_NUM_RESOURCES 8 #define MAX_CPUS_PER_NODE 32 +/* + * completion statuses for sending a TLB flush message + */ +#define FLUSH_RETRY 1 +#define FLUSH_GIVEUP 2 +#define FLUSH_COMPLETE 3 -/* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) */ -/* If the 'multilevel' flag in the header portion of the descriptor +/* + * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) + * If the 'multilevel' flag in the header portion of the descriptor * has been set to 0, then endpoint multi-unicast mode is selected. * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit * 'base_dest_nodeid' field of the header corresponds to the - * destination nodeID associated with that specified bit. */ + * destination nodeID associated with that specified bit. + */ struct bau_target_nodemask { unsigned long bits[BITS_TO_LONGS(256)]; }; -/* mask of cpu's on a node */ -/* (during initialization we need to check that unsigned long has - enough bits for max. cpu's per node) */ +/* + * mask of cpu's on a node + * (during initialization we need to check that unsigned long has + * enough bits for max. cpu's per node) + */ struct bau_local_cpumask { unsigned long bits; }; @@ -99,7 +118,9 @@ struct bau_local_cpumask { * the s/w ack bit vector ] */ -/* The payload is software-defined for INTD transactions */ +/* + * The payload is software-defined for INTD transactions + */ struct bau_msg_payload { unsigned long address; /* signifies a page or all TLB's of the cpu */ @@ -112,8 +133,10 @@ struct bau_msg_payload { }; -/* Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) */ -/* see table 4.2.3.0.1 in broacast_assist spec. */ +/* + * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see table 4.2.3.0.1 in broacast_assist spec. + */ struct bau_msg_header { int dest_subnodeid:6; /* must be zero */ /* bits 5:0 */ @@ -173,11 +196,15 @@ struct bau_msg_header { /* bits 127:107 */ }; -/* The format of the message to send, plus all accompanying control */ -/* Should be 64 bytes */ +/* + * The format of the message to send, plus all accompanying control + * Should be 64 bytes + */ struct bau_activation_descriptor { struct bau_target_nodemask distribution; - /* message template, consisting of header and payload: */ + /* + * message template, consisting of header and payload: + */ struct bau_msg_header header; struct bau_msg_payload payload; }; @@ -235,18 +262,24 @@ struct bau_payload_queue_entry { /* bytes 24-31 */ }; -/* one for every slot in the destination payload queue */ +/* + * one for every slot in the destination payload queue + */ struct bau_msg_status { struct bau_local_cpumask seen_by; /* map of cpu's */ }; -/* one for every slot in the destination software ack resources */ +/* + * one for every slot in the destination software ack resources + */ struct bau_sw_ack_status { struct bau_payload_queue_entry *msg; /* associated message */ int watcher; /* cpu monitoring, or -1 */ }; -/* one on every node and per-cpu; to locate the software tables */ +/* + * one on every node and per-cpu; to locate the software tables + */ struct bau_control { struct bau_activation_descriptor *descriptor_base; struct bau_payload_queue_entry *bau_msg_head; @@ -267,8 +300,8 @@ struct ptc_stats { unsigned long onetlb; /* times just one tlb on this cpu was flushed */ unsigned long s_retry; /* retries on source side timeouts */ unsigned long d_retry; /* retries on destination side timeouts */ - unsigned long sflush_ns;/* nanoseconds spent in uv_flush_tlb_others */ - unsigned long dflush_ns;/* nanoseconds spent destination side */ + unsigned long sflush; /* cycles spent in uv_flush_tlb_others */ + unsigned long dflush; /* cycles spent on destination side */ unsigned long retriesok; /* successes on retries */ unsigned long nomsg; /* interrupts with no message */ unsigned long multmsg; /* interrupts with multiple messages */ @@ -293,39 +326,11 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) bitmap_zero(&dstp->bits, nbits); } -/* - * atomic increment of a short integer - * (rather than using the __sync_add_and_fetch() intrinsic) - * - * returns the new value of the variable - */ -static inline short int atomic_inc_short(short int *v) -{ - asm volatile("movw $1, %%cx\n" - "lock ; xaddw %%cx, %0\n" - : "+m" (*v) /* outputs */ - : : "%cx", "memory"); /* inputs : clobbereds */ - return *v; -} - -/* - * atomic OR of two long integers - * (rather than using the __sync_or_and_fetch() intrinsic) - */ -static inline void atomic_or_long(unsigned long *v1, unsigned long v2) -{ - asm volatile("movq %0, %%rax; lea %1, %%rdx\n" - "lock ; orq %%rax, %%rdx\n" - : "+m" (*v1) /* outputs */ - : "m" (v1), "m" (v2) /* inputs */ - : "memory"); /* clobbereds */ -} - #define cpubit_isset(cpu, bau_local_cpumask) \ test_bit((cpu), (bau_local_cpumask).bits) -int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); -void uv_bau_message_intr1(void); -void uv_bau_timeout_intr1(void); +extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); +extern void uv_bau_message_intr1(void); +extern void uv_bau_timeout_intr1(void); #endif /* __ASM_X86_UV_BAU__ */ -- cgit v1.1 From dc163a41ffba22a6ef70b51e7ddf68aa13b4b414 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Jun 2008 14:15:43 +0200 Subject: SGI UV: TLB shootdown using broadcast assist unit TLB shootdown for SGI UV. v5: 6/12 corrections/improvements per Ingo's second review Signed-off-by: Cliff Wickman <cpw@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_uv.c | 183 ++++++++++++++++++++------------------------ include/asm-x86/uv/uv_bau.h | 21 ++--- 2 files changed, 96 insertions(+), 108 deletions(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f7bc6a6..d8705e9 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -25,15 +25,8 @@ static int uv_bau_retry_limit __read_mostly; static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */ static unsigned long uv_mmask __read_mostly; -char *status_table[] = { - "IDLE", - "ACTIVE", - "DESTINATION TIMEOUT", - "SOURCE TIMEOUT" -}; - -DEFINE_PER_CPU(struct ptc_stats, ptcstats); -DEFINE_PER_CPU(struct bau_control, bau_control); +static DEFINE_PER_CPU(struct ptc_stats, ptcstats); +static DEFINE_PER_CPU(struct bau_control, bau_control); /* * Free a software acknowledge hardware resource by clearing its Pending @@ -55,7 +48,6 @@ static void uv_reply_to_message(int resource, if (msp) msp->seen_by.bits = 0; uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); - return; } /* @@ -73,7 +65,7 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg, cpu = uv_blade_processor_id(); msg->number_of_cpus = uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); - this_cpu_mask = (unsigned long)1 << cpu; + this_cpu_mask = 1UL << cpu; if (msp->seen_by.bits & this_cpu_mask) return; atomic_or_long(&msp->seen_by.bits, this_cpu_mask); @@ -94,53 +86,60 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg, atomic_inc_short(&msg->acknowledge_count); if (msg->number_of_cpus == msg->acknowledge_count) uv_reply_to_message(sw_ack_slot, msg, msp); - return; } /* - * Examine the payload queue on all the distribution nodes to see + * Examine the payload queue on one distribution node to see * which messages have not been seen, and which cpu(s) have not seen them. * * Returns the number of cpu's that have not responded. */ -static int uv_examine_destinations(struct bau_target_nodemask *distribution) +static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) { - int sender; int i; int j; - int k; int count = 0; - struct bau_control *bau_tablesp; struct bau_payload_queue_entry *msg; struct bau_msg_status *msp; + for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; + msg++, i++) { + if ((msg->sending_cpu == sender) && (!msg->replied_to)) { + msp = bau_tablesp->msg_statuses + i; + printk(KERN_DEBUG + "blade %d: address:%#lx %d of %d, not cpu(s): ", + i, msg->address, msg->acknowledge_count, + msg->number_of_cpus); + for (j = 0; j < msg->number_of_cpus; j++) { + if (!((long)1 << j & msp-> seen_by.bits)) { + count++; + printk("%d ", j); + } + } + printk("\n"); + } + } + return count; +} + +/* + * Examine the payload queue on all the distribution nodes to see + * which messages have not been seen, and which cpu(s) have not seen them. + * + * Returns the number of cpu's that have not responded. + */ +static int uv_examine_destinations(struct bau_target_nodemask *distribution) +{ + int sender; + int i; + int count = 0; + sender = smp_processor_id(); for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE); i++) { if (!bau_node_isset(i, distribution)) continue; - bau_tablesp = uv_bau_table_bases[i]; - for (msg = bau_tablesp->va_queue_first, j = 0; - j < DESTINATION_PAYLOAD_QUEUE_SIZE; msg++, j++) { - if ((msg->sending_cpu == sender) && - (!msg->replied_to)) { - msp = bau_tablesp->msg_statuses + j; - printk(KERN_DEBUG - "blade %d: address:%#lx %d of %d, not cpu(s): ", - i, msg->address, - msg->acknowledge_count, - msg->number_of_cpus); - for (k = 0; k < msg->number_of_cpus; - k++) { - if (!((long)1 << k & msp-> - seen_by.bits)) { - count++; - printk("%d ", k); - } - } - printk("\n"); - } - } + count += uv_examine_destination(uv_bau_table_bases[i], sender); } return count; } @@ -150,7 +149,7 @@ static int uv_examine_destinations(struct bau_target_nodemask *distribution) * * return COMPLETE, RETRY or GIVEUP */ -static int uv_wait_completion(struct bau_activation_descriptor *bau_desc, +static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift) { int exams = 0; @@ -213,8 +212,8 @@ static int uv_wait_completion(struct bau_activation_descriptor *bau_desc, * Returns 0 if some remote flushing remains to be done. The mask is left * unchanged. */ -int uv_flush_send_and_wait(int cpu, int this_blade, - struct bau_activation_descriptor *bau_desc, cpumask_t *cpumaskp) +int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, + cpumask_t *cpumaskp) { int completion_status = 0; int right_shift; @@ -237,8 +236,8 @@ int uv_flush_send_and_wait(int cpu, int this_blade, time1 = get_cycles(); do { tries++; - index = ((unsigned long) - 1 << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | cpu; + index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | + cpu; uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); completion_status = uv_wait_completion(bau_desc, mmr_offset, right_shift); @@ -303,7 +302,7 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, int cpu; int this_blade; int locals = 0; - struct bau_activation_descriptor *bau_desc; + struct bau_desc *bau_desc; cpu = uv_blade_processor_id(); this_blade = uv_numa_blade_id(); @@ -315,8 +314,7 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, i = 0; for_each_cpu_mask(bit, *cpumaskp) { blade = uv_cpu_to_blade_id(bit); - if (blade > (UV_DISTRIBUTION_SIZE - 1)) - BUG(); + BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); if (blade == this_blade) { locals++; continue; @@ -360,6 +358,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs) { struct bau_payload_queue_entry *pqp; struct bau_payload_queue_entry *msg; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; struct pt_regs *old_regs = set_irq_regs(regs); cycles_t time1, time2; int msg_slot; @@ -376,7 +376,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs) local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); - pqp = __get_cpu_var(bau_control).va_queue_first; + pqp = va_queue_first = __get_cpu_var(bau_control).va_queue_first; + va_queue_last = __get_cpu_var(bau_control).va_queue_last; msg = __get_cpu_var(bau_control).bau_msg_head; while (msg->sw_ack_vector) { count++; @@ -387,8 +388,8 @@ void uv_bau_message_interrupt(struct pt_regs *regs) uv_bau_process_message(msg, msg_slot, sw_ack_slot); msg++; - if (msg > __get_cpu_var(bau_control).va_queue_last) - msg = __get_cpu_var(bau_control).va_queue_first; + if (msg > va_queue_last) + msg = va_queue_first; __get_cpu_var(bau_control).bau_msg_head = msg; } if (!count) @@ -401,7 +402,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs) irq_exit(); set_irq_regs(old_regs); - return; } static void uv_enable_timeouts(void) @@ -423,7 +423,6 @@ static void uv_enable_timeouts(void) pnode = uv_blade_to_pnode(blade); cur_cpu += uv_blade_nr_possible_cpus(i); } - return; } static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) @@ -535,10 +534,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, } static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show + .start = uv_ptc_seq_start, + .next = uv_ptc_seq_next, + .stop = uv_ptc_seq_stop, + .show = uv_ptc_seq_show }; static int uv_ptc_proc_open(struct inode *inode, struct file *file) @@ -568,6 +567,7 @@ static int __init uv_ptc_init(void) if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); + remove_proc_entry("sgi_uv", NULL); return -EINVAL; } proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; @@ -582,33 +582,26 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) int i; int *ip; struct bau_msg_status *msp; - struct bau_control *bau_tablesp; + struct bau_control *bau_tabp; - bau_tablesp = + bau_tabp = kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); - if (!bau_tablesp) - BUG(); - bau_tablesp->msg_statuses = + BUG_ON(!bau_tabp); + bau_tabp->msg_statuses = kmalloc_node(sizeof(struct bau_msg_status) * - DESTINATION_PAYLOAD_QUEUE_SIZE, GFP_KERNEL, node); - if (!bau_tablesp->msg_statuses) - BUG(); - for (i = 0, msp = bau_tablesp->msg_statuses; - i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, msp++) { + DEST_Q_SIZE, GFP_KERNEL, node); + BUG_ON(!bau_tabp->msg_statuses); + for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) bau_cpubits_clear(&msp->seen_by, (int) uv_blade_nr_possible_cpus(blade)); - } - bau_tablesp->watching = - kmalloc_node(sizeof(int) * DESTINATION_NUM_RESOURCES, - GFP_KERNEL, node); - if (!bau_tablesp->watching) - BUG(); - for (i = 0, ip = bau_tablesp->watching; - i < DESTINATION_PAYLOAD_QUEUE_SIZE; i++, ip++) { + bau_tabp->watching = + kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); + BUG_ON(!bau_tabp->watching); + for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) { *ip = 0; } - uv_bau_table_bases[blade] = bau_tablesp; - return bau_tablesp; + uv_bau_table_bases[blade] = bau_tabp; + return bau_tabsp; } /* @@ -616,7 +609,7 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) */ static void __init uv_table_bases_finish(int blade, int node, int cur_cpu, struct bau_control *bau_tablesp, - struct bau_activation_descriptor *adp) + struct bau_desc *adp) { int i; struct bau_control *bcp; @@ -636,7 +629,7 @@ static void __init uv_table_bases_finish(int blade, int node, int cur_cpu, /* * initialize the sending side's sending buffers */ -static struct bau_activation_descriptor * __init +static struct bau_desc * __init uv_activation_descriptor_init(int node, int pnode) { int i; @@ -644,13 +637,12 @@ uv_activation_descriptor_init(int node, int pnode) unsigned long m; unsigned long n; unsigned long mmr_image; - struct bau_activation_descriptor *adp; - struct bau_activation_descriptor *ad2; + struct bau_desc *adp; + struct bau_desc *ad2; - adp = (struct bau_activation_descriptor *) + adp = (struct bau_desc *) kmalloc_node(16384, GFP_KERNEL, node); - if (!adp) - BUG(); + BUG_ON(!adp); pa = __pa((unsigned long)adp); n = pa >> uv_nshift; m = pa & uv_mmask; @@ -660,7 +652,7 @@ uv_activation_descriptor_init(int node, int pnode) UVH_LB_BAU_SB_DESCRIPTOR_BASE, (n << UV_DESC_BASE_PNODE_SHIFT | m)); for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { - memset(ad2, 0, sizeof(struct bau_activation_descriptor)); + memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; ad2->header.base_dest_nodeid = uv_blade_to_pnode(uv_cpu_to_blade_id(0)); @@ -683,12 +675,10 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node, char *cp; struct bau_payload_queue_entry *pqp; - pqp = (struct bau_payload_queue_entry *) - kmalloc_node((DESTINATION_PAYLOAD_QUEUE_SIZE + 1) * - sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); - if (!pqp) - BUG(); + pqp = (struct bau_payload_queue_entry *) kmalloc_node( + (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, node); + BUG_ON(!pqp); cp = (char *)pqp + 31; pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); bau_tablesp->va_queue_first = pqp; @@ -699,13 +689,11 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node, uv_physnodeaddr(pqp)); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, uv_physnodeaddr(pqp)); - bau_tablesp->va_queue_last = - pqp + (DESTINATION_PAYLOAD_QUEUE_SIZE - 1); + bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, (unsigned long) uv_physnodeaddr(bau_tablesp->va_queue_last)); - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * - DESTINATION_PAYLOAD_QUEUE_SIZE); + memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); return pqp; } @@ -717,7 +705,7 @@ static int __init uv_init_blade(int blade, int node, int cur_cpu) int pnode; unsigned long pa; unsigned long apicid; - struct bau_activation_descriptor *adp; + struct bau_desc *adp; struct bau_payload_queue_entry *pqp; struct bau_control *bau_tablesp; @@ -755,7 +743,7 @@ static int __init uv_bau_init(void) uv_bau_retry_limit = 1; uv_nshift = uv_hub_info->n_val; - uv_mmask = ((unsigned long)1 << uv_hub_info->n_val) - 1; + uv_mmask = (1UL << uv_hub_info->n_val) - 1; nblades = 0; last_blade = -1; for_each_online_node(node) { @@ -767,8 +755,7 @@ static int __init uv_bau_init(void) } uv_bau_table_bases = (struct bau_control **) kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); - if (!uv_bau_table_bases) - BUG(); + BUG_ON(!uv_bau_table_bases); last_blade = -1; for_each_online_node(node) { blade = uv_node_to_blade_id(node); diff --git a/include/asm-x86/uv/uv_bau.h b/include/asm-x86/uv/uv_bau.h index e52fec8..91ac0df 100644 --- a/include/asm-x86/uv/uv_bau.h +++ b/include/asm-x86/uv/uv_bau.h @@ -54,25 +54,25 @@ /* * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ -#define DESC_STATUS_IDLE 0 -#define DESC_STATUS_ACTIVE 1 -#define DESC_STATUS_DESTINATION_TIMEOUT 2 -#define DESC_STATUS_SOURCE_TIMEOUT 3 +#define DESC_STATUS_IDLE 0 +#define DESC_STATUS_ACTIVE 1 +#define DESC_STATUS_DESTINATION_TIMEOUT 2 +#define DESC_STATUS_SOURCE_TIMEOUT 3 /* * source side threshholds at which message retries print a warning */ -#define SOURCE_TIMEOUT_LIMIT 20 -#define DESTINATION_TIMEOUT_LIMIT 20 +#define SOURCE_TIMEOUT_LIMIT 20 +#define DESTINATION_TIMEOUT_LIMIT 20 /* * number of entries in the destination side payload queue */ -#define DESTINATION_PAYLOAD_QUEUE_SIZE 17 +#define DEST_Q_SIZE 17 /* * number of destination side software ack resources */ -#define DESTINATION_NUM_RESOURCES 8 +#define DEST_NUM_RESOURCES 8 #define MAX_CPUS_PER_NODE 32 /* * completion statuses for sending a TLB flush message @@ -197,10 +197,11 @@ struct bau_msg_header { }; /* + * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ -struct bau_activation_descriptor { +struct bau_desc { struct bau_target_nodemask distribution; /* * message template, consisting of header and payload: @@ -281,7 +282,7 @@ struct bau_sw_ack_status { * one on every node and per-cpu; to locate the software tables */ struct bau_control { - struct bau_activation_descriptor *descriptor_base; + struct bau_desc *descriptor_base; struct bau_payload_queue_entry *bau_msg_head; struct bau_payload_queue_entry *va_queue_first; struct bau_payload_queue_entry *va_queue_last; -- cgit v1.1 From b4c286e6af24a116228c8c9f58b9a9eb5b7c000a Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Jun 2008 14:28:19 +0200 Subject: SGI UV: clean up arch/x86/kernel/tlb_uv.c Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_uv.c | 107 +++++++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d8705e9..7bdbf67 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -11,19 +11,22 @@ #include <linux/kernel.h> #include <asm/mmu_context.h> -#include <asm/idle.h> -#include <asm/genapic.h> -#include <asm/uv/uv_hub.h> #include <asm/uv/uv_mmrs.h> +#include <asm/uv/uv_hub.h> #include <asm/uv/uv_bau.h> +#include <asm/genapic.h> +#include <asm/idle.h> #include <asm/tsc.h> #include <mach_apic.h> -static struct bau_control **uv_bau_table_bases __read_mostly; -static int uv_bau_retry_limit __read_mostly; -static int uv_nshift __read_mostly; /* position of pnode (which is nasid>>1) */ -static unsigned long uv_mmask __read_mostly; +static struct bau_control **uv_bau_table_bases __read_mostly; +static int uv_bau_retry_limit __read_mostly; + +/* position of pnode (which is nasid>>1): */ +static int uv_nshift __read_mostly; + +static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); @@ -37,8 +40,8 @@ static DEFINE_PER_CPU(struct bau_control, bau_control); * be sent (the hardware will only do one reply per message). */ static void uv_reply_to_message(int resource, - struct bau_payload_queue_entry *msg, - struct bau_msg_status *msp) + struct bau_payload_queue_entry *msg, + struct bau_msg_status *msp) { unsigned long dw; @@ -55,11 +58,11 @@ static void uv_reply_to_message(int resource, * Other cpu's may come here at the same time for this message. */ static void uv_bau_process_message(struct bau_payload_queue_entry *msg, - int msg_slot, int sw_ack_slot) + int msg_slot, int sw_ack_slot) { - int cpu; unsigned long this_cpu_mask; struct bau_msg_status *msp; + int cpu; msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; cpu = uv_blade_processor_id(); @@ -96,11 +99,11 @@ static void uv_bau_process_message(struct bau_payload_queue_entry *msg, */ static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) { - int i; - int j; - int count = 0; struct bau_payload_queue_entry *msg; struct bau_msg_status *msp; + int count = 0; + int i; + int j; for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { @@ -111,7 +114,7 @@ static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) i, msg->address, msg->acknowledge_count, msg->number_of_cpus); for (j = 0; j < msg->number_of_cpus; j++) { - if (!((long)1 << j & msp-> seen_by.bits)) { + if (!((1L << j) & msp->seen_by.bits)) { count++; printk("%d ", j); } @@ -135,8 +138,7 @@ static int uv_examine_destinations(struct bau_target_nodemask *distribution) int count = 0; sender = smp_processor_id(); - for (i = 0; i < (sizeof(struct bau_target_nodemask) * BITSPERBYTE); - i++) { + for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { if (!bau_node_isset(i, distribution)) continue; count += uv_examine_destination(uv_bau_table_bases[i], sender); @@ -217,11 +219,11 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, { int completion_status = 0; int right_shift; - int bit; - int blade; int tries = 0; - unsigned long index; + int blade; + int bit; unsigned long mmr_offset; + unsigned long index; cycles_t time1; cycles_t time2; @@ -294,7 +296,7 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, * Returns 0 if some remote flushing remains to be done. */ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, - unsigned long va) + unsigned long va) { int i; int bit; @@ -356,12 +358,12 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, */ void uv_bau_message_interrupt(struct pt_regs *regs) { - struct bau_payload_queue_entry *pqp; - struct bau_payload_queue_entry *msg; struct bau_payload_queue_entry *va_queue_first; struct bau_payload_queue_entry *va_queue_last; + struct bau_payload_queue_entry *msg; struct pt_regs *old_regs = set_irq_regs(regs); - cycles_t time1, time2; + cycles_t time1; + cycles_t time2; int msg_slot; int sw_ack_slot; int fw; @@ -376,13 +378,14 @@ void uv_bau_message_interrupt(struct pt_regs *regs) local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); - pqp = va_queue_first = __get_cpu_var(bau_control).va_queue_first; + va_queue_first = __get_cpu_var(bau_control).va_queue_first; va_queue_last = __get_cpu_var(bau_control).va_queue_last; + msg = __get_cpu_var(bau_control).bau_msg_head; while (msg->sw_ack_vector) { count++; fw = msg->sw_ack_vector; - msg_slot = msg - pqp; + msg_slot = msg - va_queue_first; sw_ack_slot = ffs(fw) - 1; uv_bau_process_message(msg, msg_slot, sw_ack_slot); @@ -484,7 +487,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) * >0: retry limit */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) + size_t count, loff_t *data) { long newmode; char optstr[64]; @@ -587,42 +590,48 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) bau_tabp = kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); BUG_ON(!bau_tabp); + bau_tabp->msg_statuses = kmalloc_node(sizeof(struct bau_msg_status) * DEST_Q_SIZE, GFP_KERNEL, node); BUG_ON(!bau_tabp->msg_statuses); + for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) bau_cpubits_clear(&msp->seen_by, (int) uv_blade_nr_possible_cpus(blade)); + bau_tabp->watching = kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); BUG_ON(!bau_tabp->watching); - for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) { + + for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) *ip = 0; - } + uv_bau_table_bases[blade] = bau_tabp; + return bau_tabsp; } /* * finish the initialization of the per-blade control structures */ -static void __init uv_table_bases_finish(int blade, int node, int cur_cpu, - struct bau_control *bau_tablesp, - struct bau_desc *adp) +static void __init +uv_table_bases_finish(int blade, int node, int cur_cpu, + struct bau_control *bau_tablesp, + struct bau_desc *adp) { - int i; struct bau_control *bcp; + int i; - for (i = cur_cpu; i < (cur_cpu + uv_blade_nr_possible_cpus(blade)); - i++) { + for (i = cur_cpu; i < cur_cpu + uv_blade_nr_possible_cpus(blade); i++) { bcp = (struct bau_control *)&per_cpu(bau_control, i); - bcp->bau_msg_head = bau_tablesp->va_queue_first; - bcp->va_queue_first = bau_tablesp->va_queue_first; - bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->watching = bau_tablesp->watching; - bcp->msg_statuses = bau_tablesp->msg_statuses; - bcp->descriptor_base = adp; + + bcp->bau_msg_head = bau_tablesp->va_queue_first; + bcp->va_queue_first = bau_tablesp->va_queue_first; + bcp->va_queue_last = bau_tablesp->va_queue_last; + bcp->watching = bau_tablesp->watching; + bcp->msg_statuses = bau_tablesp->msg_statuses; + bcp->descriptor_base = adp; } } @@ -643,14 +652,18 @@ uv_activation_descriptor_init(int node, int pnode) adp = (struct bau_desc *) kmalloc_node(16384, GFP_KERNEL, node); BUG_ON(!adp); + pa = __pa((unsigned long)adp); n = pa >> uv_nshift; m = pa & uv_mmask; + mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE); - if (mmr_image) + if (mmr_image) { uv_write_global_mmr64(pnode, (unsigned long) UVH_LB_BAU_SB_DESCRIPTOR_BASE, (n << UV_DESC_BASE_PNODE_SHIFT | m)); + } + for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; @@ -669,16 +682,17 @@ uv_activation_descriptor_init(int node, int pnode) /* * initialize the destination side's receiving buffers */ -static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node, - int pnode, struct bau_control *bau_tablesp) +static struct bau_payload_queue_entry * __init +uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) { - char *cp; struct bau_payload_queue_entry *pqp; + char *cp; pqp = (struct bau_payload_queue_entry *) kmalloc_node( (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), GFP_KERNEL, node); BUG_ON(!pqp); + cp = (char *)pqp + 31; pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); bau_tablesp->va_queue_first = pqp; @@ -694,6 +708,7 @@ static struct bau_payload_queue_entry * __init uv_payload_queue_init(int node, (unsigned long) uv_physnodeaddr(bau_tablesp->va_queue_last)); memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); + return pqp; } @@ -756,6 +771,7 @@ static int __init uv_bau_init(void) uv_bau_table_bases = (struct bau_control **) kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); BUG_ON(!uv_bau_table_bases); + last_blade = -1; for_each_online_node(node) { blade = uv_node_to_blade_id(node); @@ -767,6 +783,7 @@ static int __init uv_bau_init(void) } set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); uv_enable_timeouts(); + return 0; } __initcall(uv_bau_init); -- cgit v1.1 From d400524affeb84bdfc2b00cd561fbfb8c09dadd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Jun 2008 14:51:57 +0200 Subject: SGI UV: TLB shootdown using broadcast assist unit, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/tlb_uv.c: In function ‘uv_table_bases_init': arch/x86/kernel/tlb_uv.c:612: error: ‘bau_tabsp' undeclared (first use in this function) arch/x86/kernel/tlb_uv.c:612: error: (Each undeclared identifier is reported only once arch/x86/kernel/tlb_uv.c:612: error: for each function it appears in.) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7bdbf67..b362913 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -609,7 +609,7 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) uv_bau_table_bases[blade] = bau_tabp; - return bau_tabsp; + return bau_tabp; } /* -- cgit v1.1 From b6df1b8bc1250191cfee15627697111c1cbda53f Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Thu, 19 Jun 2008 21:51:05 -0500 Subject: x86: fix stack overflow for large values of MAX_APICS physid_mask_of_physid() causes a huge stack (12k) to be created if the number of APICS is large. Replace physid_mask_of_physid() with a new function that does not create large stacks. This is a problem only on large x86_64 systems. this paves the way to increase MAX_APICS. Signed-off-by: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org Cc: mingo@elte.hu Cc: tglx@linutronix.de Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- arch/x86/kernel/smpboot.c | 5 ++--- include/asm-x86/mpspec.h | 7 +++++++ 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d5767cb..3f13a1a 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1269,7 +1269,7 @@ int __init APIC_init_uniprocessor(void) #ifdef CONFIG_CRASH_DUMP boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); #endif - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); setup_local_APIC(); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd..5e22f8d 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -942,7 +942,7 @@ int __init APIC_init_uniprocessor(void) verify_local_APIC(); - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); setup_local_APIC(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e1cece..664a5db 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1091,10 +1091,9 @@ static __init void disable_smp(void) smpboot_clear_io_apic_irqs(); #endif if (smp_found_config) - phys_cpu_present_map = - physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else - phys_cpu_present_map = physid_mask_of_physid(0); + physid_set_mask_of_physid(0, &phys_cpu_present_map); map_cpu_to_logical_apicid(); cpu_set(0, per_cpu(cpu_sibling_map, 0)); cpu_set(0, per_cpu(cpu_core_map, 0)); diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index 57a991b..b69e7ba 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -101,6 +101,7 @@ typedef struct physid_mask physid_mask_t; __physid_mask; \ }) +/* Note: will create very large stack frames if physid_mask_t is big */ #define physid_mask_of_physid(physid) \ ({ \ physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ @@ -108,6 +109,12 @@ typedef struct physid_mask physid_mask_t; __physid_mask; \ }) +static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +{ + physids_clear(*map); + physid_set(physid, *map); +} + #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } -- cgit v1.1 From ab9c0bb8a8c1d71dd303abdaa61ec496128e2fbe Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Mon, 16 Jun 2008 12:09:10 -0500 Subject: x86: increase MAX_APICS for very large x86-64 configs Increase the maximum number of apics when running very large configurations. This patch has no affect on most systems. The patch has no effect on any 32-bit kernel. It adds ~4k to the size of 64-bit kernels but only if NR_CPUS > 255. Signed-off-by: Jack Steiner <steiner@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mpspec_def.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/mpspec_def.h b/include/asm-x86/mpspec_def.h index dc6ef85..38d1e73 100644 --- a/include/asm-x86/mpspec_def.h +++ b/include/asm-x86/mpspec_def.h @@ -17,10 +17,11 @@ # define MAX_MPC_ENTRY 1024 # define MAX_APICS 256 #else -/* - * A maximum of 255 APICs with the current APIC ID architecture. - */ -# define MAX_APICS 255 +# if NR_CPUS <= 255 +# define MAX_APICS 255 +# else +# define MAX_APICS 32768 +# endif #endif struct intel_mp_floating { -- cgit v1.1 From cef53278682eb2604cbd99de64cdb59a8b35235a Mon Sep 17 00:00:00 2001 From: Cliff Wickman <cpw@sgi.com> Date: Thu, 19 Jun 2008 11:16:24 -0500 Subject: x86, SGI UV: TLB shootdown using broadcast assist unit, v6 v6: 6/19 close the security hole in uv_ptc_proc_write()) > Found a potential security hole while doing that: > static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, > size_t count, loff_t *data) > if (copy_from_user(optstr, user, count)) > return -EFAULT; > > is count guaranteed to never be larger than 64? is fixed below. It adds tlb_uv.o to the Makefile. Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: mingo@elte.hu Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_uv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index b362913..c503b7f 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -492,6 +492,8 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long newmode; char optstr[64]; + if (count > 64) + return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; optstr[count - 1] = '\0'; -- cgit v1.1 From e7eb8726d0e144f0925972c4ecee945e91a42753 Mon Sep 17 00:00:00 2001 From: Cliff Wickman <cpw@sgi.com> Date: Mon, 23 Jun 2008 08:32:25 -0500 Subject: x86, SGI UV: uv_ptc_proc_write fix Someone could write 0 bytes to /proc/sgi_uv/ptc_statistics, causing optstr[count - 1] = '\0'; to write to who-knows-where. (Andi Kleen noticed this need from a patch I sent for similar code in the ia64 world (sn2_ptc_proc_write()).) (count less than zero is not possible here, as count is unsigned) Signed-off-by: Cliff Wickman <cpw@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index c503b7f..d0fbb77 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -492,7 +492,7 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long newmode; char optstr[64]; - if (count > 64) + if (count == 0 || count > sizeof(optstr)) return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; -- cgit v1.1 From 1ecd27657b735128a728ebf0c31fce5e1456718a Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Fri, 20 Jun 2008 15:38:22 +0200 Subject: x86: unify crashkernel reservation for 32 and 64 bit This patch moves the reserve_crashkernel() to setup.c and removes the architecture-specific version. Both versions were more or less the same. I tested it on both x86-64 and i386, with CONFIG_KEXEC on and off (so that it compiles). Signed-off-by: Bernhard Walle <bwalle@suse.de> Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 53 ---------------------------------------- arch/x86/kernel/setup_64.c | 40 ------------------------------ include/asm-x86/setup.h | 3 +++ 4 files changed, 64 insertions(+), 93 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ebb0a2b..c5330f6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/percpu.h> +#include <linux/kexec.h> #include <asm/smp.h> #include <asm/percpu.h> #include <asm/sections.h> @@ -11,6 +12,7 @@ #include <asm/topology.h> #include <asm/mpspec.h> #include <asm/apicdef.h> +#include <asm/highmem.h> #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; @@ -404,3 +406,62 @@ EXPORT_SYMBOL(node_to_cpumask); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ #endif /* X86_64_NUMA */ + + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +#ifdef CONFIG_KEXEC +void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base <= 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + } +} +#else +void __init reserve_crashkernel(void) +{} +#endif + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index a9b19ad..369d0fe 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -429,59 +429,6 @@ extern unsigned long __init setup_memory(void); extern void zone_sizes_init(void); #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif - - return total << PAGE_SHIFT; -} - -#ifdef CONFIG_KEXEC -static void __init reserve_crashkernel(void) -{ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; - int ret; - - total_mem = get_total_mem(); - - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); - return; - } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); - return; - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } -} -#else -static inline void __init reserve_crashkernel(void) -{} -#endif - #ifdef CONFIG_BLK_DEV_INITRD static bool do_relocate_initrd = false; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 16ef53a..504caea 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -228,46 +228,6 @@ static inline void copy_edd(void) } #endif -#ifdef CONFIG_KEXEC -static void __init reserve_crashkernel(void) -{ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; - int ret; - - total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; - - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret == 0 && crash_size) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); - return; - } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); - return; - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } -} -#else -static inline void __init reserve_crashkernel(void) -{} -#endif - /* * setup_arch - architecture-specific boot-time initializations * diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index e14b6e7..b434bdd 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -8,6 +8,9 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); +/* Crashkernel reservation */ +void reserve_crashkernel(void); + #ifndef CONFIG_PARAVIRT #define paravirt_post_allocator_init() do {} while (0) #endif -- cgit v1.1 From a7bf0bd5e6af7fe69342dabf2a3b721f0163469a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 28 May 2008 15:02:14 +0100 Subject: build: add __page_aligned_data and __page_aligned_bss Making a variable page-aligned by using __attribute__((section(".data.page_aligned"))) is fragile because if sizeof(variable) is not also a multiple of page size, it leaves variables in the remainder of the section unaligned. This patch introduces two new qualifiers, __page_aligned_data and __page_aligned_bss to set the section *and* the alignment of variables. This makes page-aligned variables more robust because the linker will make sure they're aligned properly. Unfortunately it requires *all* page-aligned data to use these macros... Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup64.c | 2 +- arch/x86/mm/ioremap.c | 3 +-- include/linux/linkage.h | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 631ea6c..fc1a56d 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -40,7 +40,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; unsigned long __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL_GPL(__supported_pte_mask); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 416ea41..0561fde 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -394,8 +394,7 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __section(.bss.page_aligned); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 2119610..9fd1f85 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -1,6 +1,7 @@ #ifndef _LINUX_LINKAGE_H #define _LINUX_LINKAGE_H +#include <linux/compiler.h> #include <asm/linkage.h> #ifdef __cplusplus @@ -17,6 +18,9 @@ # define asmregparm #endif +#define __page_aligned_data __section(.data.page_aligned) __aligned(PAGE_SIZE) +#define __page_aligned_bss __section(.bss.page_aligned) __aligned(PAGE_SIZE) + /* * This is used by architectures to keep arguments on the stack * untouched by the compiler by keeping them live until the end. -- cgit v1.1 From 9cf4f298e29abba25c16679fe7be70898223167e Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Tue, 27 May 2008 18:22:54 -0700 Subject: x86: use stack_start in x86_64 call x86_64's init_rsp stack_start, just as i386 does. Put a zeroed stack segment for consistency. With this, we can eliminate one ugly ifdef in smpboot.c. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/head_64.S | 5 +++-- arch/x86/kernel/smpboot.c | 7 +------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 36af01f..a81f468 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -86,7 +86,7 @@ int acpi_save_state_mem(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; - init_rsp = (unsigned long)temp_stack + 4096; + stack_start.sp = temp_stack + 4096; initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; #endif /* CONFIG_64BIT */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 263b9d1..918a271 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -191,7 +191,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq init_rsp(%rip),%rsp + movq stack_start(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -252,8 +252,9 @@ ENTRY(secondary_startup_64) .quad x86_64_start_kernel __FINITDATA - ENTRY(init_rsp) + ENTRY(stack_start) .quad init_thread_union+THREAD_SIZE-8 + .word 0 bad_address: jmp bad_address diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d3ad4e0..a71e3ca 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -714,11 +714,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, -#ifdef CONFIG_X86_64 - (unsigned long)init_rsp); -#else (unsigned long)stack_start.sp); -#endif /* * Run STARTUP IPI loop. @@ -905,15 +901,14 @@ do_rest: early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); c_idle.idle->thread.ip = (unsigned long) start_secondary; /* Stack for startup_32 can be just as for start_secondary onwards */ - stack_start.sp = (void *) c_idle.idle->thread.sp; irq_ctx_init(cpu); #else cpu_pda(cpu)->pcurrent = c_idle.idle; - init_rsp = c_idle.idle->thread.sp; load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); initial_code = (unsigned long)start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); #endif + stack_start.sp = (void *) c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); -- cgit v1.1 From 736f12bff9d9e7b4e895c64f73b190c8383fc2a1 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Tue, 27 May 2008 20:14:51 -0700 Subject: x86: don't use gdt_page openly. There's a macro available for that. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index cf37d20..dc7c05e 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1119,7 +1119,7 @@ void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; -- cgit v1.1 From a939098afcfa5f81d3474782ec15c6d114e57763 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 16:19:53 -0700 Subject: x86: move x86_64 gdt closer to i386 i386 and x86_64 used two different schemes for maintaining the gdt. With this patch, x86_64 initial gdt table is defined in a .c file, same way as i386 is now. Also, we call it "gdt_page", and the descriptor, "early_gdt_descr". This way we achieve common naming, which can allow for more code integration. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 48 +++++----------------------------------- arch/x86/kernel/setup64.c | 5 +---- arch/x86/kernel/setup_64.c | 19 ++++++++++++++-- arch/x86/kernel/smpboot.c | 12 +++------- arch/x86/kernel/x8664_ksyms_64.c | 5 ----- include/asm-x86/desc.h | 24 +++++++++----------- include/asm-x86/segment.h | 23 ++++++++++--------- 7 files changed, 48 insertions(+), 88 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 918a271..32f5a11 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -203,7 +203,7 @@ ENTRY(secondary_startup_64) * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr(%rip) + lgdt early_gdt_descr(%rip) /* set up data segments. actually 0 would do too */ movl $__KERNEL_DS,%eax @@ -391,54 +391,16 @@ NEXT_PAGE(level2_spare_pgt) .data .align 16 - .globl cpu_gdt_descr -cpu_gdt_descr: - .word gdt_end-cpu_gdt_table-1 -gdt: - .quad cpu_gdt_table -#ifdef CONFIG_SMP - .rept NR_CPUS-1 - .word 0 - .quad 0 - .endr -#endif + .globl early_gdt_descr +early_gdt_descr: + .word GDT_ENTRIES*8-1 + .quad per_cpu__gdt_page ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ - - .section .data.page_aligned, "aw" - .align PAGE_SIZE - -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ - .quad 0x00af9b000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ - .quad 0x00cffb000000ffff /* __USER32_CS */ - .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affb000000ffff /* __USER_CS */ - .quad 0x0 /* unused */ - .quad 0,0 /* TSS */ - .quad 0,0 /* LDT */ - .quad 0,0,0 /* three TLS descriptors */ - .quad 0x0000f40000000000 /* node/CPU stored in limit */ -gdt_end: - /* asm/segment.h:GDT_ENTRIES must match this */ - /* This should be a multiple of the cache line size */ - /* GDTs of other CPUs are now dynamically allocated */ - - /* zero the remaining page */ - .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 - .section .bss, "aw", @nobits .align L1_CACHE_BYTES ENTRY(idt_table) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index fc1a56d..70ff071 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -202,11 +202,8 @@ void __cpuinit cpu_init (void) * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ - if (cpu) - memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr[cpu].size = GDT_SIZE; - load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); + switch_to_new_gdt(); load_idt((const struct desc_ptr *)&idt_descr); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 504caea..a93300d 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -81,8 +81,6 @@ #define ARCH_SETUP #endif -#include "cpu/cpu.h" - /* * Machine setup.. */ @@ -228,6 +226,23 @@ static inline void copy_edd(void) } #endif +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) __init memory_setup(void) +{ + machine_specific_memory_setup(); +} + +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} + /* * setup_arch - architecture-specific boot-time initializations * diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a71e3ca..fe2bd51 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -849,14 +849,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; INIT_WORK(&c_idle.work, do_fork_idle); -#ifdef CONFIG_X86_64 - /* allocate memory for gdts of secondary cpus. Hotplug is considered */ - if (!cpu_gdt_descr[cpu].address && - !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { - printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); - return -1; - } +#ifdef CONFIG_X86_64 /* Allocate node local memory for AP pdas */ if (cpu > 0) { boot_error = get_local_pda(cpu); @@ -898,7 +892,6 @@ do_rest: #ifdef CONFIG_X86_32 per_cpu(current_task, cpu) = c_idle.idle; init_gdt(cpu); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); c_idle.idle->thread.ip = (unsigned long) start_secondary; /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); @@ -908,6 +901,7 @@ do_rest: initial_code = (unsigned long)start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); #endif + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); stack_start.sp = (void *) c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ @@ -1252,8 +1246,8 @@ void __init native_smp_prepare_boot_cpu(void) int me = smp_processor_id(); #ifdef CONFIG_X86_32 init_gdt(me); - switch_to_new_gdt(); #endif + switch_to_new_gdt(); /* already set me in cpu_online_map in boot_cpu_init() */ cpu_set(me, cpu_callout_map); per_cpu(cpu_state, me) = CPU_ONLINE; diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f6c05d0..2f306a8 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -53,8 +53,3 @@ EXPORT_SYMBOL(init_level4_pgt); EXPORT_SYMBOL(load_gs_index); EXPORT_SYMBOL(_proxy_pda); - -#ifdef CONFIG_PARAVIRT -/* Virtualized guests may want to use it */ -EXPORT_SYMBOL_GPL(cpu_gdt_descr); -#endif diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index b3875d4..07f9f2b 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -29,11 +29,17 @@ static inline void fill_ldt(struct desc_struct *desc, extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + #ifdef CONFIG_X86_64 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; -extern struct desc_ptr cpu_gdt_descr[]; -/* the cpu gdt accessor */ -#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address) static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) @@ -51,16 +57,6 @@ static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, } #else -struct gdt_page { - struct desc_struct gdt[GDT_ENTRIES]; -} __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU(struct gdt_page, gdt_page); - -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) -{ - return per_cpu(gdt_page, cpu).gdt; -} - static inline void pack_gate(gate_desc *gate, unsigned char type, unsigned long base, unsigned dpl, unsigned flags, unsigned short seg) diff --git a/include/asm-x86/segment.h b/include/asm-x86/segment.h index ed5131d..dfc8601 100644 --- a/include/asm-x86/segment.h +++ b/include/asm-x86/segment.h @@ -61,18 +61,14 @@ #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_DEFAULT_USER_CS 14 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) #define GDT_ENTRY_DEFAULT_USER_DS 15 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) #define GDT_ENTRY_KERNEL_BASE 12 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) @@ -139,10 +135,11 @@ #else #include <asm/cache.h> -#define __KERNEL_CS 0x10 -#define __KERNEL_DS 0x18 +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 -#define __KERNEL32_CS 0x08 +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) /* * we cannot use the same code segment descriptor for user and kernel @@ -150,10 +147,10 @@ * The segment offset needs to contain a RPL. Grr. -AK * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) */ - -#define __USER32_CS 0x23 /* 4*8+3 */ -#define __USER_DS 0x2b /* 5*8+3 */ -#define __USER_CS 0x33 /* 6*8+3 */ +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3) #define __USER32_DS __USER_DS #define GDT_ENTRY_TSS 8 /* needs two entries */ @@ -175,6 +172,10 @@ #endif +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) #ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif -- cgit v1.1 From e3f77edfc1d0beb7b10f9f31d9e39206f7dbef7b Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 12:57:02 -0300 Subject: x86: use initial_code for i386 x86_64 jumps to whatever is written in "initial_code" symbol, instead of a fixed address. Do it for i386 too. It will allow us to integrate more of the smp boot code. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_32.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b98b338..ffb73a5 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -455,7 +455,10 @@ is386: movl $2,%ecx # set MP jmp initialize_secondary # all other CPUs call initialize_secondary 1: #endif /* CONFIG_SMP */ - jmp i386_start_kernel + jmp *(initial_code) +.align 4 +ENTRY(initial_code) + .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. -- cgit v1.1 From 3e9704739daf46a8ba6593d749c67b5f7cd633d2 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 13:01:54 -0300 Subject: x86: boot secondary cpus through initial_code remove "initialize_secondary". Boot both architectures via initial_code. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/smpboot.c | 25 +------------------------ 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ffb73a5..f67e934 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -452,7 +452,7 @@ is386: movl $2,%ecx # set MP je 1f movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu - jmp initialize_secondary # all other CPUs call initialize_secondary + movl (stack_start), %esp 1: #endif /* CONFIG_SMP */ jmp *(initial_code) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe2bd51..2a0d39f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -349,28 +349,6 @@ static void __cpuinit start_secondary(void *unused) cpu_idle(); } -#ifdef CONFIG_X86_32 -/* - * Everything has been set up for the secondary - * CPUs - they just need to reload everything - * from the task structure - * This function must not return. - */ -void __devinit initialize_secondary(void) -{ - /* - * We don't actually need to load the full TSS, - * basically just the stack pointer and the ip. - */ - - asm volatile( - "movl %0,%%esp\n\t" - "jmp *%1" - : - :"m" (current->thread.sp), "m" (current->thread.ip)); -} -#endif - static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 @@ -892,16 +870,15 @@ do_rest: #ifdef CONFIG_X86_32 per_cpu(current_task, cpu) = c_idle.idle; init_gdt(cpu); - c_idle.idle->thread.ip = (unsigned long) start_secondary; /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); #else cpu_pda(cpu)->pcurrent = c_idle.idle; load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); - initial_code = (unsigned long)start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + initial_code = (unsigned long)start_secondary; stack_start.sp = (void *) c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ -- cgit v1.1 From 0f385d1ddd0952e01a968bfa5512378ad23de6df Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 17:09:53 -0700 Subject: x86: clearing io_apic harmless for x86_64 so remove ifdef. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2a0d39f..a74a261 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1052,9 +1052,8 @@ static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); -#ifdef CONFIG_X86_32 smpboot_clear_io_apic_irqs(); -#endif + if (smp_found_config) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else -- cgit v1.1 From 86e430edf462e872ecfab28d6b8619be5ab9c300 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 20:05:46 -0700 Subject: x86: remove ifdef from stepping The stepping won't affect x86_64, since there are not x86_64 k7's or pentiums. So, although it adds to the binary size, remove the ifdef for smoother integration Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a74a261..6f9a31a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -351,7 +351,6 @@ static void __cpuinit start_secondary(void *unused) static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_32 /* * Mask B, Pentium, but not Pentium MMX */ @@ -401,7 +400,6 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit smp_checks(void) -- cgit v1.1 From 3fde690011a84e19f98f77bfaa349b2119ddd2d2 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 20:34:19 -0700 Subject: x86: change __setup_vector_irq with setup_vector_irq We create a version of it for i386, and then take the CONFIG_X86_64 ifdef out of the game. We could create a __setup_vector_irq for i386, but it would incur in an unnecessary lock taking. Moreover, it is better practice to only export setup_vector_irq anyway. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/io_apic_32.c | 5 +++++ arch/x86/kernel/io_apic_64.c | 9 ++++++++- arch/x86/kernel/smpboot.c | 11 ++--------- include/asm-x86/hw_irq.h | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index fedb3b1..d6af301 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1207,6 +1207,11 @@ static int assign_irq_vector(int irq) return vector; } + +void setup_vector_irq(int cpu) +{ +} + static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 2eba4f4..08c4875 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -801,7 +801,7 @@ static void __clear_irq_vector(int irq) cpus_clear(cfg->domain); } -void __setup_vector_irq(int cpu) +static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ @@ -824,6 +824,13 @@ void __setup_vector_irq(int cpu) } } +void setup_vector_irq(int cpu) +{ + spin_lock(&vector_lock); + __setup_vector_irq(smp_processor_id()); + spin_unlock(&vector_lock); +} + static struct irq_chip ioapic_chip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6f9a31a..e09f312 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -329,15 +329,8 @@ static void __cpuinit start_secondary(void *unused) * smp_call_function(). */ lock_ipi_call_lock(); -#ifdef CONFIG_X86_64 - spin_lock(&vector_lock); - - /* Setup the per cpu irq handling data structures */ - __setup_vector_irq(smp_processor_id()); - /* - * Allow the master to continue. - */ - spin_unlock(&vector_lock); +#ifdef CONFIG_X86_IO_APIC + setup_vector_irq(smp_processor_id()); #endif cpu_set(smp_processor_id(), cpu_online_map); unlock_ipi_call_lock(); diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 1428b41..18f067c 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -97,9 +97,9 @@ extern void (*const interrupt[NR_IRQS])(void); #else typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -extern void __setup_vector_irq(int cpu); extern spinlock_t vector_lock; #endif +extern void setup_vector_irq(int cpu); #endif /* !ASSEMBLY_ */ -- cgit v1.1 From b5841765a2e735a38612c4e4a82170c33d701b3c Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 28 May 2008 13:38:28 -0300 Subject: x86: provide connect_bsp_APIC for x86_64 Although it is not really needed, we provide it to get closer to i386. ifdefs around it are removed in smpboot.c Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_64.c | 10 ++++++++++ arch/x86/kernel/smpboot.c | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d9d663f..5279dc9 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -918,6 +918,8 @@ int __init APIC_init_uniprocessor(void) verify_local_APIC(); + connect_bsp_APIC(); + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); @@ -999,6 +1001,14 @@ asmlinkage void smp_error_interrupt(void) irq_exit(); } +/** + * * connect_bsp_APIC - attach the APIC to the interrupt system + * */ +void __init connect_bsp_APIC(void) +{ + enable_apic_mode(); +} + void disconnect_bsp_APIC(int virt_wire_setup) { /* Go back to Virtual Wire compatibility mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e09f312..a569f06 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,7 @@ static int __init smp_sanity_check(unsigned max_cpus) localise_nmi_watchdog(); -#ifdef CONFIG_X86_32 connect_bsp_APIC(); -#endif setup_local_APIC(); end_local_APIC_setup(); return -1; @@ -1173,9 +1171,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } preempt_enable(); -#ifdef CONFIG_X86_32 connect_bsp_APIC(); -#endif + /* * Switch from PIC to APIC mode. */ -- cgit v1.1 From 78e622705c69da9649ba87071d8de85054b62df8 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 4 Jun 2008 02:03:07 -0300 Subject: x86: change naming to match x86_64 Change unmap_cpu_to_logical_apicid to numa_remove_cpu. Besides being shorter, it is the same name x86_64 uses. We can save an ifdef in the code this way. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a569f06..b99c386 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -181,13 +181,12 @@ static void map_cpu_to_logical_apicid(void) map_cpu_to_node(cpu, node); } -static void unmap_cpu_to_logical_apicid(int cpu) +static void numa_remove_cpu(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; unmap_cpu_to_node(cpu); } #else -#define unmap_cpu_to_logical_apicid(cpu) do {} while (0) #define map_cpu_to_logical_apicid() do {} while (0) #endif @@ -946,10 +945,7 @@ restore_state: if (boot_error) { /* Try to put things back the way they were before ... */ - unmap_cpu_to_logical_apicid(cpu); -#ifdef CONFIG_X86_64 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ -#endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpu_clear(cpu, cpu_present_map); @@ -1244,7 +1240,7 @@ void cpu_exit_clear(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - unmap_cpu_to_logical_apicid(cpu); + numa_remove_cpu(cpu); } # endif /* CONFIG_X86_32 */ -- cgit v1.1 From b553a1e0ff48bd66fd18f705370e47c0b4ecea61 Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 4 Jun 2008 02:05:03 -0300 Subject: x86: remove cpu from maps during cpu disable, take cpus out of all maps in i386, instead of just the online map. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b99c386..820c23d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1319,13 +1319,11 @@ __init void prefill_possible_map(void) static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); -#ifdef CONFIG_X86_64 cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ clear_bit(cpu, (unsigned long *)&cpu_initialized); numa_remove_cpu(cpu); -#endif } int __cpu_disable(void) -- cgit v1.1 From 1481a3dd42c21ac4a8b9497cb9f5df816d6b064f Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 4 Jun 2008 15:35:03 -0300 Subject: x86: move cpu_exit_clear to process_32.c Take it out of smpboot.c, and move it to process_32.c, closer to its only user. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process_32.c | 16 ++++++++++++++++ arch/x86/kernel/smpboot.c | 19 +------------------ include/asm-x86/numa_32.h | 1 + include/asm-x86/smp.h | 1 - 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c2a11d7..9a139f6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -74,6 +74,22 @@ unsigned long thread_saved_pc(struct task_struct *tsk) #ifdef CONFIG_HOTPLUG_CPU #include <asm/nmi.h> + +static void cpu_exit_clear(void) +{ + int cpu = raw_smp_processor_id(); + + idle_task_exit(); + + cpu_uninit(); + irq_ctx_exit(cpu); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + + numa_remove_cpu(cpu); +} + /* We don't actually take CPU down, just spin without interrupts. */ static inline void play_dead(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 820c23d..67af727 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -181,7 +181,7 @@ static void map_cpu_to_logical_apicid(void) map_cpu_to_node(cpu, node); } -static void numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; unmap_cpu_to_node(cpu); @@ -1227,23 +1227,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_HOTPLUG_CPU -# ifdef CONFIG_X86_32 -void cpu_exit_clear(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - cpu_uninit(); - irq_ctx_exit(cpu); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); -} -# endif /* CONFIG_X86_32 */ - static void remove_siblinginfo(int cpu) { int sibling; diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h index a02674f..ed6c88e 100644 --- a/include/asm-x86/numa_32.h +++ b/include/asm-x86/numa_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_32_NUMA_H 1 extern int pxm_to_nid(int pxm); +extern void numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA extern void __init remap_numa_kva(void); diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index fc10073..fad45f6 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -188,7 +188,6 @@ static inline int hard_smp_processor_id(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_HOTPLUG_CPU -extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif -- cgit v1.1 From 7f6cbc905ee22c457e0dcd0bba9d4affbc290a6f Mon Sep 17 00:00:00 2001 From: Glauber Costa <gcosta@redhat.com> Date: Wed, 4 Jun 2008 23:05:39 -0700 Subject: x86: take load_sp0 out of smpboot.c there's no particular reason to do load_sp0 in different places for i386 and x86_64. They should all be in cpu_init. Right now, cpu_init itself is not integrated, but with this patch, the code becomes closer to each other, making in easier to integrate when the time comes. Furthermore, although doing it in do_boot_cpu for x86_64 is fine, since it's only a copy, load_sp0 should be executed in the cpu it refers to anyway. Signed-off-by: Glauber Costa <gcosta@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup64.c | 1 + arch/x86/kernel/smpboot.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 70ff071..151d315 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -247,6 +247,7 @@ void __cpuinit cpu_init (void) BUG(); enter_lazy_tlb(&init_mm, me); + load_sp0(t, ¤t->thread); set_tss_desc(cpu, t); load_TR_desc(); load_LDT(&init_mm.context); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 67af727..3b48d1f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -864,7 +864,6 @@ do_rest: irq_ctx_init(cpu); #else cpu_pda(cpu)->pcurrent = c_idle.idle; - load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); clear_tsk_thread_flag(c_idle.idle, TIF_FORK); #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); -- cgit v1.1 From 1ea598c29748a559a0086a84a016886d786e6272 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 13 Jun 2008 20:31:54 +0200 Subject: x86: fix sleep.c build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/acpi/sleep.c: In function ‘acpi_save_state_mem': arch/x86/kernel/acpi/sleep.c:75: error: ‘stack_start' undeclared (first use in this function) arch/x86/kernel/acpi/sleep.c:75: error: (Each undeclared identifier is reported only once arch/x86/kernel/acpi/sleep.c:75: error: for each function it appears in.) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/sleep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a81f468..e6a4b56 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -86,7 +86,9 @@ int acpi_save_state_mem(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; +#ifdef CONFIG_SMP stack_start.sp = temp_stack + 4096; +#endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; #endif /* CONFIG_64BIT */ -- cgit v1.1 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056..a0484ad 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd2..bb06423 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + +static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + return data->ret; + +} + +static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b2..cf1cd3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3a..e25b6b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.1 From be5bf9fa1c327fa6fd6e7ba44665437dd558dfe3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 14:54:46 -0700 Subject: xen: reserve Xen-specific memory in e820 map Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/setup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a295758..dc2ca8a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -40,9 +40,22 @@ char * __init xen_memory_setup(void) max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); e820.nr_map = 0; + e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + /* + * Reserve Xen bits: + * - mfn_list + * - xen_start_info + * See comment above "struct start_info" in <xen/interface/xen.h> + */ + e820_add_region(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list, + E820_RESERVED); + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + return "Xen"; } -- cgit v1.1 From b792c755907cffceab393585b626ef2553c38538 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 14:54:49 -0700 Subject: xen: reserve ISA space in e820 map [ TODO: release the underlying memory back to Xen. ] Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Ian Campbell <ian.campbell@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/setup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dc2ca8a..e0a3959 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -13,6 +13,7 @@ #include <asm/vdso.h> #include <asm/e820.h> #include <asm/setup.h> +#include <asm/acpi.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> @@ -41,8 +42,15 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM); + e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + + /* + * Even though this is normal, usable memory under Xen, reserve + * ISA memory anyway because too many things think they can poke + * about in there. + */ + e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, + E820_RESERVED); /* * Reserve Xen bits: -- cgit v1.1 From 88a6846c70ad6bf33a545d554ace801d69e8a1a5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 16 Jun 2008 14:54:51 -0700 Subject: xen: set max_pfn_mapped Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Cc: the arch/x86 maintainers <x86@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 316623c..76ad1ef 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1304,6 +1304,7 @@ asmlinkage void __init xen_start_kernel(void) init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; init_mm.pgd = pgd; /* use the Xen pagetables to start */ -- cgit v1.1 From 3c999f142665265afd0fe9190204dd051f17e505 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 20 Jun 2008 16:11:20 -0700 Subject: x86: check command line when CONFIG_X86_MPPARSE is not set, v2 if acpi=off, acpi=noirq and pci=noacpi, we need to disable apic. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: "Maciej W. Rozycki" <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 14 ++++++++++++++ arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++++ arch/x86/kernel/setup_32.c | 5 +++++ arch/x86/kernel/setup_64.c | 9 +++++++++ include/asm-x86/apic.h | 6 +++++- include/linux/acpi.h | 6 ++++++ 7 files changed, 44 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6516359..5c01076 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1787,6 +1787,20 @@ static int __init parse_pci(char *arg) } early_param("pci", parse_pci); +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + printk(KERN_WARNING "MPS support code is not built-in.\n" + "Using acpi=off or acpi=noirq or pci=noacpi " + "may have problem\n"); + return 1; + } +#endif + return 0; +} + #ifdef CONFIG_X86_IO_APIC static int __init parse_acpi_skip_timer_override(char *arg) { diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index dd8de26..4932d78 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -57,7 +57,7 @@ unsigned long mp_lapic_addr; * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata; +int enable_local_apic; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c5330f6..56aee55 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -161,6 +161,10 @@ void __init setup_per_cpu_areas(void) char *ptr; int cpu; + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); #else diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 369d0fe..cad4e89 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -677,6 +677,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()){ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + finish_e820_parsing(); probe_roms(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index a93300d..175c696 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -302,6 +302,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()) { + disable_apic = 1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); @@ -723,6 +728,10 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); + + /* early_param could clear that, but recall get it set again */ + if (disable_apic) + clear_cpu_cap(c, X86_FEATURE_APIC); } /* diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 313bcaf..9fe941c 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -39,8 +39,12 @@ extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int ioapic_force; -extern int disable_apic; +#ifdef CONFIG_X86_64 +extern int disable_apic; +#else +extern int enable_local_apic; +#endif /* * Basic functions accessing APICs. */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7..0601075 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -82,6 +82,7 @@ char * __acpi_map_table (unsigned long phys_addr, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); int acpi_boot_table_init (void); +int acpi_mps_check (void); int acpi_numa_init (void); int acpi_table_init (void); @@ -250,6 +251,11 @@ static inline int acpi_boot_table_init(void) return 0; } +static inline int acpi_mps_check(void) +{ + return 0; +} + static inline int acpi_check_resource_conflict(struct resource *res) { return 0; -- cgit v1.1 From ce38cc79964687d3c7e92663bc040552416fca27 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 01:14:27 -0700 Subject: x86: clean up init_amd() 1. move out calling of check_enable_amd_mmconf_dmi out of setup_64.c put it into init_amd(), so don't need to make extra dmi check for system with other cpus. 2. 15 --> 0xf Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/amd_64.c | 22 ++++++++++++++-------- arch/x86/kernel/setup_64.c | 4 ---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 30b7557..958526d 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -131,7 +131,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 15) { + if (c->x86 == 0xf) { rdmsrl(MSR_K8_HWCR, value); value |= 1 << 6; wrmsrl(MSR_K8_HWCR, value); @@ -143,10 +143,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) clear_cpu_cap(c, 0*32+31); /* On C+ stepping K8 rep microcode works well for copy/memset */ - level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || - level >= 0x0f58)) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); + if (c->x86 == 0xf) { + level = cpuid_eax(1); + if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); @@ -157,7 +158,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = get_model_name(c); if (!level) { switch (c->x86) { - case 15: + case 0xf: /* Should distinguish Models here, but this is only a fallback anyways. */ strcpy(c->x86_model_id, "Hammer"); @@ -176,14 +177,19 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) else num_cache_leaves = 3; - if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0xf && c->x86 <= 0x11) set_cpu_cap(c, X86_FEATURE_K8); /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - if (c->x86 == 0x10) + if (c->x86 == 0x10) { + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + fam10h_check_enable_mmcfg(); + } if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { unsigned long long tseg; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 175c696..c94464a 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -72,7 +72,6 @@ #include <asm/topology.h> #include <asm/trampoline.h> #include <asm/pat.h> -#include <asm/mmconfig.h> #include <mach_apic.h> #ifdef CONFIG_PARAVIRT @@ -474,9 +473,6 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif - - /* do this before identify_cpu for boot cpu */ - check_enable_amd_mmconf_dmi(); } struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -- cgit v1.1 From 04606618bb50c4ec754585a82732ea4facfe2bc9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 01:38:41 -0700 Subject: x86: remove some acpi ifdefs in setup_32/64 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 5 +---- arch/x86/kernel/setup_64.c | 4 ---- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index cad4e89..e274ee6 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -726,12 +726,10 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); -#ifdef CONFIG_ACPI /* * Parse the ACPI tables for possible boot-time SMP configuration. */ acpi_boot_table_init(); -#endif #ifdef CONFIG_ACPI_NUMA /* @@ -812,9 +810,8 @@ void __init setup_arch(char **cmdline_p) early_quirks(); -#ifdef CONFIG_ACPI acpi_boot_init(); -#endif + #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) if (smp_found_config) get_smp_config(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c94464a..9b516ee 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -355,13 +355,11 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). * Call this early for SRAT node setup. */ acpi_boot_table_init(); -#endif /* How many end-of-memory variables you have, grandma! */ max_low_pfn = end_pfn; @@ -432,12 +430,10 @@ void __init setup_arch(char **cmdline_p) early_quirks(); -#ifdef CONFIG_ACPI /* * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); -#endif init_cpu_to_node(); -- cgit v1.1 From f580366f77cc4e035a68369105fbeae5bf436b4c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 03:24:19 -0700 Subject: x86: seperate funcs from setup_64 to cpu common_64.c Signed-off-by: Yinghai Lu <yhlu.kernel@mail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common_64.c | 406 ++++++++++++++++++++ arch/x86/kernel/setup_64.c | 823 ---------------------------------------- include/asm-x86/processor.h | 1 + 4 files changed, 408 insertions(+), 824 deletions(-) create mode 100644 arch/x86/kernel/cpu/common_64.c delete mode 100644 arch/x86/kernel/setup_64.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 65b1be5..ee76eaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -6,7 +6,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o +obj-$(CONFIG_X86_64) += common_64.o bugs_64.o obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o obj-$(CONFIG_X86_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c new file mode 100644 index 0000000..b6f2050 --- /dev/null +++ b/arch/x86/kernel/cpu/common_64.c @@ -0,0 +1,406 @@ +#include <linux/init.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/bootmem.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/mtrr.h> +#include <asm/mce.h> +#include <asm/pat.h> +#include <asm/numa.h> +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <mach_apic.h> +#endif + +#include "cpu.h" + +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} + +struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ + display_cacheinfo(c); +} + +static struct cpu_dev __cpuinitdata default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", +}; +static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + +int __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + + if (c->extended_cpuid_level < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + return 1; +} + + +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, eax, ebx, ecx, edx; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; + } + + if (n >= 0x80000006) { + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + ecx = cpuid_ecx(0x80000006); + c->x86_cache_size = ecx >> 16; + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + c->x86_cache_size, ecx & 0xFF); + } + if (n >= 0x80000008) { + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +} + +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id(index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); + } +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } + +#endif +} + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + static int printed; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + this_cpu = cpu_devs[i]; + return; + } + } + } + if (!printed) { + printed++; + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + c->x86_vendor = X86_VENDOR_UNKNOWN; +} + +static void __init early_cpu_support_print(void) +{ + int i,j; + struct cpu_dev *cpu_devx; + + printk("KERNEL supported cpus:\n"); + for (i = 0; i < X86_VENDOR_NUM; i++) { + cpu_devx = cpu_devs[i]; + if (!cpu_devx) + continue; + for (j = 0; j < 2; j++) { + if (!cpu_devx->c_ident[j]) + continue; + printk(" %s %s\n", cpu_devx->c_vendor, + cpu_devx->c_ident[j]); + } + } +} + +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + +void __init early_cpu_init(void) +{ + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start ; + cvdev < __x86cpuvendor_end ; + cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + early_cpu_support_print(); + early_identify_cpu(&boot_cpu_data); +} + +/* Do some early cpuid on the boot CPU to get some parameter that are + needed before check_bugs. Everything advanced is in identify_cpu + below. */ +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; + c->extended_cpuid_level = 0; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c); + + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + __u32 misc; + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], + &c->x86_capability[0]); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; +#ifdef CONFIG_SMP + c->phys_proc_id = c->initial_apicid; +#endif + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if (xlvl >= 0x80000004) + get_model_name(c); /* Default name */ + } + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + /* Don't set x86_cpuid_level here for now to not confuse. */ + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } + + c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + if (c->x86_vendor != X86_VENDOR_UNKNOWN && + cpu_devs[c->x86_vendor]->c_early_init) + cpu_devs[c->x86_vendor]->c_early_init(c); + + validate_pat_support(c); + + /* early_param could clear that, but recall get it set again */ + if (disable_apic) + clear_cpu_cap(c, X86_FEATURE_APIC); +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + early_identify_cpu(c); + + init_scattered_cpuid_features(c); + + c->apicid = phys_pkg_id(0); + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + detect_ht(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] &= ~cleared_cpu_caps[i]; + +#ifdef CONFIG_X86_MCE + mcheck_init(c); +#endif + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif + +} + +void __cpuinit identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); +} + +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + mtrr_ap_init(); +} + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); + else + printk(KERN_CONT "\n"); +} + +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c deleted file mode 100644 index 9b516ee..0000000 --- a/arch/x86/kernel/setup_64.c +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Copyright (C) 1995 Linus Torvalds - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/screen_info.h> -#include <linux/ioport.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/initrd.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <linux/console.h> -#include <linux/seq_file.h> -#include <linux/crash_dump.h> -#include <linux/root_dev.h> -#include <linux/pci.h> -#include <asm/pci-direct.h> -#include <linux/efi.h> -#include <linux/acpi.h> -#include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/iscsi_ibft.h> -#include <linux/mmzone.h> -#include <linux/kexec.h> -#include <linux/cpufreq.h> -#include <linux/dmi.h> -#include <linux/dma-mapping.h> -#include <linux/ctype.h> -#include <linux/sort.h> -#include <linux/uaccess.h> -#include <linux/init_ohci1394_dma.h> -#include <linux/kvm_para.h> - -#include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/vsyscall.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <video/edid.h> -#include <asm/e820.h> -#include <asm/mpspec.h> -#include <asm/dma.h> -#include <asm/gart.h> -#include <asm/mpspec.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/setup.h> -#include <asm/numa.h> -#include <asm/sections.h> -#include <asm/dmi.h> -#include <asm/cacheflush.h> -#include <asm/mce.h> -#include <asm/ds.h> -#include <asm/topology.h> -#include <asm/trampoline.h> -#include <asm/pat.h> - -#include <mach_apic.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#else -#define ARCH_SETUP -#endif - -/* - * Machine setup.. - */ - -struct cpuinfo_x86 boot_cpu_data __read_mostly; -EXPORT_SYMBOL(boot_cpu_data); - -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - -unsigned long mmu_cr4_features; - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -unsigned long saved_video_mode; - -/* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - -/* - * Setup options - */ -struct screen_info screen_info; -EXPORT_SYMBOL(screen_info); -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; - -struct edid_info edid_info; -EXPORT_SYMBOL_GPL(edid_info); - -extern int root_mountflags; - -static char __initdata command_line[COMMAND_LINE_SIZE]; - -static struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x60, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x64, .end = 0x64, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) - -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; -static struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; - -static void __init early_cpu_init(void); -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); - -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - -#ifndef CONFIG_NUMA -static void __init -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long bootmap_size, bootmap; - - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, - PAGE_SIZE); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n", bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_register_active_regions(0, start_pfn, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); -} -#endif - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, - sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; - edd.edd_info_nr = boot_params.eddbuf_entries; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -void __attribute__((weak)) __init memory_setup(void) -{ - machine_specific_memory_setup(); -} - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -/* - * setup_arch - architecture-specific boot-time initializations - * - * Note: On x86_64, fixmaps are ready for use even before this is called. - */ -void __init setup_arch(char **cmdline_p) -{ - unsigned i; - - printk(KERN_INFO "Command line: %s\n", boot_command_line); - - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL64", 4)) { - efi_enabled = 1; - efi_reserve_early(); - } -#endif - - ARCH_SETUP - - setup_memory_map(); - copy_edd(); - - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; - init_mm.end_data = (unsigned long) &_edata; - init_mm.brk = (unsigned long) &_end; - - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; - - early_cpu_init(); - early_identify_cpu(&boot_cpu_data); - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_setup_data(); - - parse_early_param(); - - if (acpi_mps_check()) { - disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - } - -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - if (init_ohci1394_dma_early) - init_ohci1394_dma_on_all_controllers(); -#endif - - finish_e820_parsing(); - - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); - - early_gart_iommu_check(); - - e820_register_active_regions(0, 0, -1UL); - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - end_pfn = e820_end_of_ram(); - - /* pre allocte 4k for mptable mpc */ - early_reserve_e820_mpc_new(); - /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); - if (mtrr_trim_uncached_memory(end_pfn)) { - remove_all_active_ranges(); - e820_register_active_regions(0, 0, -1UL); - end_pfn = e820_end_of_ram(); - } - - num_physpages = end_pfn; - - check_efer(); - - max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT)); - if (efi_enabled) - efi_init(); - - vsmp_init(); - - dmi_scan_machine(); - - io_delay_init(); - -#ifdef CONFIG_KVM_CLOCK - kvmclock_init(); -#endif - - /* - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). - * Call this early for SRAT node setup. - */ - acpi_boot_table_init(); - - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; - - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); - -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - -#ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); -#else - contig_initmem_init(0, end_pfn); -#endif - - dma32_reserve_bootmem(); - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif - -#ifdef CONFIG_X86_MPPARSE - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif -#ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_mem = end_pfn << PAGE_SHIFT; - - if (ramdisk_end <= end_of_mem) { - /* - * don't need to reserve again, already reserved early - * in x86_64_start_kernel, and early_res_to_bootmem - * convert that to reserved in bootmem - */ - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; - } else { - free_bootmem(ramdisk_image, ramdisk_size); - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_mem); - initrd_start = 0; - } - } -#endif - reserve_crashkernel(); - - reserve_ibft_region(); - - paging_init(); - map_vsyscall(); - - early_quirks(); - - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); - - init_cpu_to_node(); - -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); -#endif - init_apic_mappings(); - ioapic_init_mappings(); - - kvm_guest_init(); - - /* - * We trust e820 completely. No explicit ROM probing in memory. - */ - e820_reserve_resources(); - e820_mark_nosave_regions(end_pfn); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - - e820_setup_gap(); - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif -} - -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; - -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ - display_cacheinfo(c); -} - -static struct cpu_dev __cpuinitdata default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", -}; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; - -int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, eax, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " - "D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24) + (edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } -} - -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); - } -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } - -#endif -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - int i; - static int printed; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } - } - } - if (!printed) { - printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) { - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - -static void __init early_cpu_init(void) -{ - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_support_print(); -} - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; -#endif - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - c->extended_cpuid_level = cpuid_eax(0x80000000); - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); - - validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - early_identify_cpu(c); - - init_scattered_cpuid_features(c); - - c->apicid = phys_pkg_id(0); - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0; i < NCAPINTS; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - select_idle_routine(c); - -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif - -} - -void __cpuinit identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); -} - -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); - mtrr_ap_init(); -} - -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); - else - printk(KERN_CONT "\n"); -} - -static __init int setup_disablecpuid(char *arg) -{ - int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; -} -__setup("clearcpuid=", setup_disablecpuid); diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 4ab2ede..a5df34f 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -153,6 +153,7 @@ static inline int hlt_works(int cpu) extern void cpu_detect(struct cpuinfo_x86 *c); +extern void early_cpu_init(void); extern void identify_cpu(struct cpuinfo_x86 *); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); -- cgit v1.1 From 9a250347591da3e60b5ee53dd1d341732f081117 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 03:24:00 -0700 Subject: x86: change identify_cpu to static Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/common_64.c | 2 +- include/asm-x86/processor.h | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d0463a9..80ab20d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -427,7 +427,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index b6f2050..48ba799 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -312,7 +312,7 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index a5df34f..12b5ede 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -154,7 +154,6 @@ static inline int hlt_works(int cpu) extern void cpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_cpu(struct cpuinfo_x86 *); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); -- cgit v1.1 From 7a1fd9866cbb59a00006f1e0fd5726951b167c97 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 14:48:05 -0700 Subject: x86: add e820_remove_range ... so could add real hole in e820 agp check is using request_mem_region, and could fail if e820 is reserved... Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++ include/asm-x86/e820.h | 2 ++ 2 files changed, 37 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7b613d2..e285ea3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -429,6 +429,41 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, return real_updated_size; } +/* make e820 not cover the range */ +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype) +{ + int i; + u64 real_removed_size = 0; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + + if (checktype && ei->type != old_type) + continue; + /* totally covered? */ + if (ei->addr >= start && + (ei->addr + ei->size) <= (start + size)) { + real_removed_size += ei->size; + memset(ei, 0, sizeof(struct e820entry)); + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + real_removed_size += final_end - final_start; + + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_removed_size; +} + void __init update_e820(void) { int nr_map; diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 0e92b6a..7c32df0 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -67,6 +67,8 @@ sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type); +extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype); extern void update_e820(void); extern void e820_setup_gap(void); struct setup_data; -- cgit v1.1 From a9c1182fbd349882fe912245d6e03cd30943be2d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 15:39:41 -0700 Subject: x86: seperate probe_roms into another file it is only needed for 32bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/probe_roms_32.c | 166 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 146 ----------------------------------- include/asm-x86/setup.h | 1 + 4 files changed, 168 insertions(+), 146 deletions(-) create mode 100644 arch/x86/kernel/probe_roms_32.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9a71be8..9c9ce75 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -19,6 +19,7 @@ obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o +obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c new file mode 100644 index 0000000..675a48c --- /dev/null +++ b/arch/x86/kernel/probe_roms_32.c @@ -0,0 +1,166 @@ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/mmzone.h> +#include <linux/ioport.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/dmi.h> +#include <linux/pfn.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> + + +#include <asm/e820.h> +#include <asm/mmzone.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/io.h> +#include <setup_arch.h> + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index e274ee6..865838b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -606,8 +606,6 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif -static void probe_roms(void); - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -843,147 +841,3 @@ void __init setup_arch(char **cmdline_p) #endif } -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -static void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index b434bdd..cf87d6d 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -54,6 +54,7 @@ extern struct boot_params boot_params; #ifdef __i386__ void __init i386_start_kernel(void); +extern void probe_roms(void); extern unsigned long init_pg_tables_start; extern unsigned long init_pg_tables_end; -- cgit v1.1 From 0f0124fa742da7c51e2e3c5ded7f5e5e06ddc195 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 16:25:37 -0700 Subject: x86: merge setup64.c into common_64.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/cpu/common_64.c | 277 +++++++++++++++++++++++++++++++++++++- arch/x86/kernel/setup64.c | 287 ---------------------------------------- 3 files changed, 277 insertions(+), 289 deletions(-) delete mode 100644 arch/x86/kernel/setup64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9c9ce75..0a1987b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 48ba799..9fb5b7c 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -1,10 +1,17 @@ #include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/kgdb.h> +#include <linux/topology.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/module.h> #include <linux/percpu.h> -#include <linux/bootmem.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/msr.h> @@ -19,6 +26,15 @@ #include <asm/apic.h> #include <mach_apic.h> #endif +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/proto.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/genapic.h> #include "cpu.h" @@ -404,3 +420,262 @@ static __init int setup_disablecpuid(char *arg) return 1; } __setup("clearcpuid=", setup_disablecpuid); + +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +static int do_not_nx __cpuinitdata; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +int force_personality32; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +void pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } + + pda->irqstackptr += IRQSTACKSIZE-64; +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] +__attribute__((section(".bss.page_aligned"))); + +extern asmlinkage void ignore_sysret(void); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +} + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +unsigned long kernel_eflags; + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. + */ +void __cpuinit cpu_init(void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); +} diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c deleted file mode 100644 index 151d315..0000000 --- a/arch/x86/kernel/setup64.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * X86-64 specific CPU setup. - * Copyright (C) 1995 Linus Torvalds - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. - * See setup.c for older changelog. - */ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <linux/kgdb.h> -#include <linux/topology.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/mmu_context.h> -#include <asm/smp.h> -#include <asm/i387.h> -#include <asm/percpu.h> -#include <asm/proto.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/genapic.h> - -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata = 0; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32 = 0; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to set CS/DS - * but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init (); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) { - __supported_pte_mask &= ~_PAGE_NX; - } -} - -unsigned long kernel_eflags; - -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init (void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) { - pda_init(cpu); - } else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk("Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); - load_idt((const struct desc_ptr *)&idt_descr); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered. */ - } -#endif - - fpu_init(); - - raw_local_save_flags(kernel_eflags); - - if (is_uv_system()) - uv_cpu_init(); -} -- cgit v1.1 From f81be876eaa9c71b3024c3dc05e4d1bf210cc255 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 19:16:52 -0700 Subject: x86: remove two duplicated funcs in setup_32.c early_cpu_init is declared in processor.h memory_setup is defined in e820.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 865838b..602a45c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -206,7 +206,6 @@ struct ist_info ist_info; EXPORT_SYMBOL(ist_info); #endif -extern void early_cpu_init(void); extern int root_mountflags; unsigned long saved_video_mode; -- cgit v1.1 From ce97c40e28223c148e142bda7af48fd0f27c81f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 20:22:09 -0700 Subject: x86: move reserve_standard_io_resource to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 32 ++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 57 +--------------------------------------------- include/asm-x86/setup.h | 2 ++ 3 files changed, 35 insertions(+), 56 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 56aee55..3b9ec81 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -468,4 +468,36 @@ void __init reserve_crashkernel(void) void __init reserve_crashkernel(void) {} #endif +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 602a45c..2574ec8 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -108,58 +108,6 @@ static struct resource video_ram_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x0060, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0064, - .end = 0x0064, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ @@ -614,7 +562,6 @@ static void set_mca_bus(int x) { } */ void __init setup_arch(char **cmdline_p) { - int i; unsigned long max_low_pfn; memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -824,9 +771,7 @@ void __init setup_arch(char **cmdline_p) e820_mark_nosave_regions(max_low_pfn); request_resource(&iomem_resource, &video_ram_resource); - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); + reserve_standard_io_resources(); e820_setup_gap(); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index cf87d6d..bb12a16 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -38,6 +38,8 @@ void reserve_crashkernel(void); #ifndef __ASSEMBLY__ #include <asm/bootparam.h> +void reserve_standard_io_resources(void); + #ifndef _SETUP /* -- cgit v1.1 From 17b4cceb1feb2a8865ce47064dd3bd446063a5d5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 21 Jun 2008 21:02:20 -0700 Subject: x86: move elfcorehdr parsing to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 19 +++++++++++++++++++ arch/x86/kernel/setup_32.c | 16 ---------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3b9ec81..5497fb9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -4,6 +4,7 @@ #include <linux/bootmem.h> #include <linux/percpu.h> #include <linux/kexec.h> +#include <linux/crash_dump.h> #include <asm/smp.h> #include <asm/percpu.h> #include <asm/sections.h> @@ -501,3 +502,21 @@ void __init reserve_standard_io_resources(void) } +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); +#endif + + + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2574ec8..d24ac26 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -42,7 +42,6 @@ #include <linux/iscsi_ibft.h> #include <linux/nodemask.h> #include <linux/kexec.h> -#include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> #include <linux/pci.h> @@ -194,21 +193,6 @@ static inline void copy_edd(void) } #endif -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ -static int __init parse_elfcorehdr(char *arg) -{ - if (!arg) - return -EINVAL; - - elfcorehdr_addr = memparse(arg, &arg); - return 0; -} -early_param("elfcorehdr", parse_elfcorehdr); -#endif /* CONFIG_PROC_VMCORE */ - /* * highmem=size forces highmem to be exactly 'size' bytes. * This works even on boxes that have no highmem otherwise. -- cgit v1.1 From 1f75d7e32ed47b2ab8570771a2ce8c707a7225a2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 22 Jun 2008 02:44:49 -0700 Subject: x86: introduce initmem_init for 64 bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 16 ++++++++++++++++ arch/x86/mm/numa_64.c | 2 +- include/asm-x86/numa_64.h | 1 - include/asm-x86/page_64.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 97c2bc7..99a091e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -611,6 +611,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon } #ifndef CONFIG_NUMA +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long bootmap_size, bootmap; + + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, + PAGE_SIZE); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n", bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_register_active_regions(0, start_pfn, end_pfn); + free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); +} + void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c4557e2..316e5f9 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -514,7 +514,7 @@ out: } #endif /* CONFIG_NUMA_EMU */ -void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn) +void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) { int i; diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index b510daf..3830094 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -22,7 +22,6 @@ extern int hotadd_percent; extern s16 apicid_to_node[MAX_LOCAL_APIC]; -extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long numa_free_all_bootmem(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index 6ea7285..ac37643 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -83,6 +83,7 @@ typedef struct { pteval_t pte; } pte_t; extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM -- cgit v1.1 From b2ac82a0909aea0d2620ba4c189f37c567c21fe5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 22 Jun 2008 02:45:39 -0700 Subject: x86: introduce initmem_init for 32 bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 94 +--------------------------------------------- arch/x86/mm/discontig_32.c | 4 +- arch/x86/mm/init_32.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/page_32.h | 5 +++ 4 files changed, 99 insertions(+), 94 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index d24ac26..190546b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -300,71 +300,11 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -static void __init setup_bootmem_allocator(void); -static unsigned long __init setup_memory(void) -{ - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); - - max_low_pfn = find_max_low_pfn(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) { - highstart_pfn = max_low_pfn; - } - memory_present(0, 0, highend_pfn); - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - memory_present(0, 0, max_low_pfn); - num_physpages = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif -#ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - - setup_bootmem_allocator(); - - return max_low_pfn; -} - -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - remove_all_active_ranges(); -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - e820_register_active_regions(0, 0, highend_pfn); -#else - e820_register_active_regions(0, 0, max_low_pfn); -#endif - - free_area_init_nodes(max_zone_pfns); -} -#else -extern unsigned long __init setup_memory(void); -extern void zone_sizes_init(void); -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ - #ifdef CONFIG_BLK_DEV_INITRD static bool do_relocate_initrd = false; -static void __init reserve_initrd(void) +void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; @@ -480,36 +420,6 @@ static void __init relocate_initrd(void) #endif /* CONFIG_BLK_DEV_INITRD */ -void __init setup_bootmem_allocator(void) -{ - int i; - unsigned long bootmap_size, bootmap; - /* - * Initialize the boot-time allocator (with low memory only): - */ - bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, - max_pfn_mapped<<PAGE_SHIFT, bootmap_size, - PAGE_SIZE); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n", bootmap_size); - reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); -#ifdef CONFIG_BLK_DEV_INITRD - reserve_initrd(); -#endif - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); - printk(KERN_INFO " mapped low ram: 0 - %08lx\n", - max_pfn_mapped<<PAGE_SHIFT); - printk(KERN_INFO " low ram: %08lx - %08lx\n", - min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); - printk(KERN_INFO " bootmap %08lx - %08lx\n", - bootmap, bootmap + bootmap_size); - for_each_online_node(i) - free_bootmem_with_active_regions(i, max_low_pfn); - early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); - -} - /* * The node 0 pgdat is initialized before all of these because * it's needed for bootmem. node>0 pgdats have their virtual @@ -666,7 +576,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif - max_low_pfn = setup_memory(); + max_low_pfn = initmem_init(0, max_pfn); #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a2f73ba..3e75be4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,8 +309,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) { int nid; unsigned long system_start_pfn, system_max_low_pfn; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a0484ad..9bc8607 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -540,6 +540,96 @@ static void __init set_nx(void) } #endif +#ifndef CONFIG_NEED_MULTIPLE_NODES +extern unsigned long find_max_low_pfn(void); +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + memory_present(0, 0, highend_pfn); + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + memory_present(0, 0, max_low_pfn); + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + remove_all_active_ranges(); +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + e820_register_active_regions(0, 0, highend_pfn); +#else + e820_register_active_regions(0, 0, max_low_pfn); +#endif + + free_area_init_nodes(max_zone_pfns); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +extern void reserve_initrd(void); + +void __init setup_bootmem_allocator(void) +{ + int i; + unsigned long bootmap_size, bootmap; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, + max_pfn_mapped<<PAGE_SHIFT, bootmap_size, + PAGE_SIZE); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n", bootmap_size); + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); +#ifdef CONFIG_BLK_DEV_INITRD + reserve_initrd(); +#endif + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + printk(KERN_INFO " low ram: %08lx - %08lx\n", + min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); + printk(KERN_INFO " bootmap %08lx - %08lx\n", + bootmap, bootmap + bootmap_size); + for_each_online_node(i) + free_bootmem_with_active_regions(i, max_low_pfn); + early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); + +} + + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 73ed2e4..a0713d1 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -92,6 +92,11 @@ extern int sysctl_legacy_va_layout; #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define MAXMEM (-__PAGE_OFFSET - __VMALLOC_RESERVE) +extern unsigned long initmem_init(unsigned long, unsigned long); +extern void zone_sizes_init(void); +extern void setup_bootmem_allocator(void); + + #ifdef CONFIG_X86_USE_3DNOW #include <asm/mmx.h> -- cgit v1.1 From 225c37d71bc8b97eb2063e8eda153b383328b20b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 22 Jun 2008 02:46:58 -0700 Subject: x86: introduce reserve_initrd Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 16 ++++++++++------ arch/x86/mm/init_32.c | 6 ++---- include/asm-x86/setup.h | 2 ++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 190546b..90b5104 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -336,7 +336,7 @@ void __init reserve_initrd(void) * in i386_start_kernel */ initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; + initrd_end = initrd_start + ramdisk_size; return; } @@ -363,7 +363,7 @@ void __init reserve_initrd(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -static void __init relocate_initrd(void) +static void __init post_reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; @@ -417,7 +417,13 @@ static void __init relocate_initrd(void) /* need to free that, otherwise init highmem will reserve it again */ free_early(ramdisk_image, ramdisk_image+ramdisk_size); } - +#else +void __init reserve_initrd(void) +{ +} +static void __init post_reserve_initrd(void) +{ +} #endif /* CONFIG_BLK_DEV_INITRD */ /* @@ -632,9 +638,7 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ -#ifdef CONFIG_BLK_DEV_INITRD - relocate_initrd(); -#endif + post_reserve_initrd(); remapped_pgdat_init(); sparse_init(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9bc8607..9808078 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -597,8 +597,6 @@ void __init zone_sizes_init(void) } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -extern void reserve_initrd(void); - void __init setup_bootmem_allocator(void) { int i; @@ -613,9 +611,9 @@ void __init setup_bootmem_allocator(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); -#ifdef CONFIG_BLK_DEV_INITRD + reserve_initrd(); -#endif + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index bb12a16..8f85b24 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -39,6 +39,8 @@ void reserve_crashkernel(void); #include <asm/bootparam.h> void reserve_standard_io_resources(void); +void reserve_initrd(void); + #ifndef _SETUP -- cgit v1.1 From 7f0be02c5ed1deb04c54c6a17f412e04f417df11 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 22 Jun 2008 17:37:54 -0700 Subject: x86: move boot_params declaring to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/common_64.c | 6 ------ arch/x86/kernel/setup.c | 6 ++++++ arch/x86/kernel/setup_32.c | 6 ------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 9fb5b7c..39eaefb 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -421,12 +421,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda **_cpu_pda __read_mostly; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5497fb9..5c0c4bb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -15,6 +15,12 @@ #include <asm/apicdef.h> #include <asm/highmem.h> +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 90b5104..3149f43 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -163,12 +163,6 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE -- cgit v1.1 From 90d967e0ef68f5312ed4b081d5c9312ff53c1c93 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 21:00:45 +0200 Subject: x86: move find_max_low_pfn to init_32.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 77 --------------------------------------- arch/x86/mm/init_32.c | 89 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 78 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 3149f43..1315500 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -129,9 +129,6 @@ unsigned int BIOS_revision; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; -/* user-defined highmem size */ -static unsigned int highmem_pages = -1; - /* * Early DMI memory */ @@ -188,21 +185,6 @@ static inline void copy_edd(void) #endif /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ -static int __init parse_highmem(char *arg) -{ - if (!arg) - return -EINVAL; - - highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; - return 0; -} -early_param("highmem", parse_highmem); - -/* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. This can be used to increase (or decrease) the * vmalloc area - the default is 128m. @@ -235,65 +217,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); -/* - * Determine low and high memory ranges: - */ -unsigned long __init find_max_low_pfn(void) -{ - unsigned long max_low_pfn; - - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn = MAXMEM_PFN; -#ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; -#else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_HIGHMEM64G - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - } -#endif /* !CONFIG_HIGHMEM64G */ -#endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); -#endif - } - return max_low_pfn; -} - #ifdef CONFIG_BLK_DEV_INITRD static bool do_relocate_initrd = false; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9808078..d101733 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -540,8 +540,95 @@ static void __init set_nx(void) } #endif +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +/* + * Determine low and high memory ranges: + */ +unsigned long __init find_max_low_pfn(void) +{ + unsigned long max_low_pfn; + + max_low_pfn = max_pfn; + if (max_low_pfn > MAXMEM_PFN) { + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING "only %luMB highmem pages " + "available, ignoring highmem size of %uMB.\n", + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", + MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING + "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING "Warning only 4GB will be used." + "Use a HIGHMEM64G enabled kernel.\n"); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ + } else { + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR "highmem size specified (%uMB) is " + "bigger than pages available (%luMB)!.\n", + pages_to_mb(highmem_pages), + pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < + 64*1024*1024/PAGE_SIZE){ + printk(KERN_ERR "highmem size %uMB results in " + "smaller than 64MB lowmem, ignoring it.\n" + , pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem" + " kernel!\n"); +#endif + } + return max_low_pfn; +} + #ifndef CONFIG_NEED_MULTIPLE_NODES -extern unsigned long find_max_low_pfn(void); unsigned long __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { -- cgit v1.1 From bef1568d9714f1162086c32583ba7984a7ca8e3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 22 Jun 2008 17:40:10 -0700 Subject: x86: move reservetop and vmalloc parsing to pgtable_32.c also change reserve_top_address to __init attibute Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 33 --------------------------------- arch/x86/mm/pgtable_32.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 1315500..9a08490 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -184,39 +184,6 @@ static inline void copy_edd(void) } #endif -/* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. - */ -static int __init parse_vmalloc(char *arg) -{ - if (!arg) - return -EINVAL; - - __VMALLOC_RESERVE = memparse(arg, &arg); - return 0; -} -early_param("vmalloc", parse_vmalloc); - -/* - * reservetop=size reserves a hole at the top of the kernel address space which - * a hypervisor can load into later. Needed for dynamically loaded hypervisors, - * so relocating the fixmap can be done before paging initialization. - */ -static int __init parse_reservetop(char *arg) -{ - unsigned long address; - - if (!arg) - return -EINVAL; - - address = memparse(arg, &arg); - reserve_top_address(address); - return 0; -} -early_param("reservetop", parse_reservetop); - #ifdef CONFIG_BLK_DEV_INITRD static bool do_relocate_initrd = false; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0662f34..828907d 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL(__FIXADDR_TOP); * Can be used to relocate the fixmap area and poke a hole in the top * of kernel address space to make room for a hypervisor. */ -void reserve_top_address(unsigned long reserve) +void __init reserve_top_address(unsigned long reserve) { BUG_ON(fixmaps_set > 0); printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", @@ -160,3 +160,36 @@ void reserve_top_address(unsigned long reserve) __FIXADDR_TOP = -reserve - PAGE_SIZE; __VMALLOC_RESERVE += reserve; } + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; +} +early_param("vmalloc", parse_vmalloc); + +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + return 0; +} +early_param("reservetop", parse_reservetop); -- cgit v1.1 From 2ec65f8b89ea003c27ff7723525a2ee335a2b393 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 03:05:30 -0700 Subject: x86: clean up using max_low_pfn on 32-bit so that max_low_pfn is not changed after it is set. so we can move that early and out of initmem_init. could call find_low_pfn_range just after max_pfn is set. also could move reserve_initrd out of setup_bootmem_allocator so 32bit is more like 64bit. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 16 ++++++++++------ arch/x86/mm/discontig_32.c | 22 +++++++--------------- arch/x86/mm/init_32.c | 25 +++++++++---------------- include/asm-x86/page_32.h | 3 ++- include/asm-x86/setup.h | 2 -- 5 files changed, 28 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9a08490..b42f570 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -188,13 +188,14 @@ static inline void copy_edd(void) static bool do_relocate_initrd = false; -void __init reserve_initrd(void) +static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; u64 ramdisk_here; + u64 ramdisk_target; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -202,7 +203,7 @@ void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= end_of_lowmem/2) { + if (ramdisk_size >= (end_of_lowmem>>1)) { free_early(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); @@ -225,7 +226,8 @@ void __init reserve_initrd(void) } /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT, + ramdisk_target = max_pfn_mapped<<PAGE_SHIFT; + ramdisk_here = find_e820_area(min(ramdisk_target, end_of_lowmem>>1), end_of_lowmem, ramdisk_size, PAGE_SIZE); @@ -346,8 +348,6 @@ static void set_mca_bus(int x) { } */ void __init setup_arch(char **cmdline_p) { - unsigned long max_low_pfn; - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); @@ -450,6 +450,10 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820_end_of_ram(); } + find_low_pfn_range(); + + reserve_initrd(); + dmi_scan_machine(); io_delay_init(); @@ -466,7 +470,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif - max_low_pfn = initmem_init(0, max_pfn); + initmem_init(0, max_pfn); #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3e75be4..1dfff70 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,11 +309,10 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int nid; - unsigned long system_start_pfn, system_max_low_pfn; long kva_target_pfn; /* @@ -324,17 +323,11 @@ unsigned long __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - /* call find_max_low_pfn at first, it could update max_pfn */ - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, @@ -357,19 +350,19 @@ unsigned long __init initmem_init(unsigned long start_pfn, "KVA PG"); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > system_max_low_pfn) - highstart_pfn = system_max_low_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = system_max_low_pfn; - high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", + pages_to_mb(max_low_pfn)); + printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); printk("Low memory ends at vaddr %08lx\n", @@ -387,7 +380,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); - return max_low_pfn; } void __init zone_sizes_init(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d101733..27b8293 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -561,9 +561,15 @@ early_param("highmem", parse_highmem); /* * Determine low and high memory ranges: */ -unsigned long __init find_max_low_pfn(void) +void __init find_low_pfn_range(void) { - unsigned long max_low_pfn; + /* it could update max_pfn */ + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { @@ -625,21 +631,12 @@ unsigned long __init find_max_low_pfn(void) " kernel!\n"); #endif } - return max_low_pfn; } #ifndef CONFIG_NEED_MULTIPLE_NODES -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); - - max_low_pfn = find_max_low_pfn(); - #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -661,8 +658,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); - - return max_low_pfn; } void __init zone_sizes_init(void) @@ -699,8 +694,6 @@ void __init setup_bootmem_allocator(void) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - reserve_initrd(); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index a0713d1..3810d14 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -92,7 +92,8 @@ extern int sysctl_legacy_va_layout; #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define MAXMEM (-__PAGE_OFFSET - __VMALLOC_RESERVE) -extern unsigned long initmem_init(unsigned long, unsigned long); +extern void find_low_pfn_range(void); +extern void initmem_init(unsigned long, unsigned long); extern void zone_sizes_init(void); extern void setup_bootmem_allocator(void); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 8f85b24..bb12a16 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -39,8 +39,6 @@ void reserve_crashkernel(void); #include <asm/bootparam.h> void reserve_standard_io_resources(void); -void reserve_initrd(void); - #ifndef _SETUP -- cgit v1.1 From 346cafecdeb17e1a0457a9e7eca239ef467b678c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 03:06:14 -0700 Subject: x86: clean up min_low_pfn for 32bit we already had early_res support, so don't need to track min_low_pfn. keep it to 0 always. also use init_bootmem_node instead of init_bootmem, so don't touch min_low_pfn. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 10 ++++------ arch/x86/mm/init_64.c | 4 +++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27b8293..9bb35cf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -565,11 +565,7 @@ void __init find_low_pfn_range(void) { /* it could update max_pfn */ - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); + /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { @@ -694,7 +690,9 @@ void __init setup_bootmem_allocator(void) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, + min_low_pfn, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: %08lx - %08lx\n", diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 99a091e..955dbc8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -620,7 +620,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) PAGE_SIZE); if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, + 0, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); -- cgit v1.1 From 3eb11edc1321e14a121deeb8b18c177750226eca Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 23 Jun 2008 22:19:22 +0200 Subject: x86: build fix fix: arch/x86/kernel/setup_32.c:409: error: 'enable_local_apic' undeclared (first use in this function) arch/x86/kernel/setup_32.c:409: error: (Each undeclared identifier is reported only once arch/x86/kernel/setup_32.c:409: error: for each function it appears in.) Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b42f570..1e67037 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -406,7 +406,9 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); if (acpi_mps_check()){ +#ifdef CONFIG_X86_LOCAL_APIC enable_local_apic = -1; +#endif clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); } -- cgit v1.1 From 6a07a0edacba397205ff97308b22c6b6aab9f791 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 14:02:36 -0700 Subject: x86: fix compile warning in init_64.c len is long and ret is only for NUMA Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 955dbc8..54a9840 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -830,9 +830,9 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, { #ifdef CONFIG_NUMA int nid, next_nid; + int ret; #endif unsigned long pfn = phys >> PAGE_SHIFT; - int ret; if (pfn >= end_pfn) { /* @@ -842,7 +842,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, if (pfn < max_pfn_mapped) return -EFAULT; - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", phys, len); return -EFAULT; } -- cgit v1.1 From c09434571d4b1d8abf530ba4ce28cb868b45f2e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 16:41:30 -0700 Subject: x86: numa32 pfn print out using hex instead Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/srat_32.c | 31 +++++++++++++++++++------------ arch/x86/mm/discontig_32.c | 29 +++++++++++++++-------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 5978023..f41d67f 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -93,7 +93,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - printk("CPU 0x%02X in proximity domain 0x%02X\n", + printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); } @@ -134,7 +134,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) if (num_memory_chunks >= MAXCHUNKS) { - printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", + printk(KERN_WARNING "Too many mem chunks in SRAT." + " Ignoring %lld MBytes at %llx\n", size/(1024*1024), paddr); return; } @@ -155,7 +156,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) num_memory_chunks++; - printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", + printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + " in proximity domain %02x %s\n", start_pfn, end_pfn, memory_affinity->memory_type, pxm, @@ -186,7 +188,7 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c * *possible* memory hotplug areas the same as normal RAM. */ if (memory_chunk->start_pfn >= max_pfn) { - printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", memory_chunk->start_pfn, memory_chunk->end_pfn); return; } @@ -212,7 +214,8 @@ int __init get_memcfg_from_srat(void) goto out_fail; if (num_memory_chunks == 0) { - printk("could not finy any ACPI SRAT memory areas.\n"); + printk(KERN_WARNING + "could not finy any ACPI SRAT memory areas.\n"); goto out_fail; } @@ -239,20 +242,23 @@ int __init get_memcfg_from_srat(void) for (i = 0; i < num_memory_chunks; i++) node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - printk("pxm bitmap: "); + printk(KERN_DEBUG "pxm bitmap: "); for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk("%02X ", pxm_bitmap[i]); + printk(KERN_CONT "%02x ", pxm_bitmap[i]); } - printk("\n"); - printk("Number of logical nodes in system = %d\n", num_online_nodes()); - printk("Number of memory chunks in system = %d\n", num_memory_chunks); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "Number of logical nodes in system = %d\n", + num_online_nodes()); + printk(KERN_DEBUG "Number of memory chunks in system = %d\n", + num_memory_chunks); for (i = 0; i < MAX_APICID; i++) apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", + printk(KERN_DEBUG + "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); e820_register_active_regions(chunk->nid, chunk->start_pfn, @@ -268,6 +274,7 @@ int __init get_memcfg_from_srat(void) } return 1; out_fail: - printk("failed to get NUMA memory information from SRAT table\n"); + printk(KERN_ERR "failed to get NUMA memory information from SRAT" + " table\n"); return 0; } diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 1dfff70..f5ae319 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -76,13 +76,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; - printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%ld ", pfn); + printk(KERN_CONT "%lx ", pfn); } printk(KERN_CONT "\n"); } @@ -117,7 +117,7 @@ static unsigned long kva_pages; */ int __init get_memcfg_numa_flat(void) { - printk("NUMA - single node, flat memory mode\n"); + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; @@ -233,7 +233,7 @@ static unsigned long calculate_numa_remap_pages(void) * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. */ - printk("node %d pfn: [%lx - %lx]\n", + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; @@ -268,7 +268,8 @@ static unsigned long calculate_numa_remap_pages(void) node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", size, nid, node_kva_final>>PAGE_SHIFT); /* @@ -290,7 +291,7 @@ static unsigned long calculate_numa_remap_pages(void) remove_active_range(nid, node_remap_start_pfn[nid], node_remap_start_pfn[nid] + size); } - printk("Reserving total of %ld pages for numa KVA remap\n", + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); return reserve_pages; } @@ -304,7 +305,7 @@ static void init_remap_allocator(int nid) node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) node_remap_end_vaddr[nid]); } @@ -340,9 +341,9 @@ void __init initmem_init(unsigned long start_pfn, if (kva_start_pfn == -1UL) panic("Can not get kva space\n"); - printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", kva_start_pfn, max_low_pfn); - printk("max_pfn = %ld\n", max_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ reserve_early(kva_start_pfn<<PAGE_SHIFT, @@ -362,17 +363,17 @@ void __init initmem_init(unsigned long start_pfn, #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", - min_low_pfn, max_low_pfn, highstart_pfn); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); - printk("Low memory ends at vaddr %08lx\n", + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) { init_remap_allocator(nid); allocate_pgdat(nid); } - printk("High memory starts at vaddr %08lx\n", + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) propagate_e820_map_node(nid); @@ -413,7 +414,7 @@ void __init set_highmem_pages_init(void) zone_end_pfn = zone_start_pfn + zone->spanned_pages; nid = zone_to_nid(zone); - printk("Initializing %s for node %d (%08lx:%08lx)\n", + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, -- cgit v1.1 From 11cd0bc140b5d66566c9eb49c1058737888cd75c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 19:51:10 -0700 Subject: x86: move some func calling from setup_arch to paging_init those function depend on paging setup pgtable, so they could access the ram in bootmem region but just get mapped. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 34 ++-------------------------------- arch/x86/mm/init_32.c | 29 +++++++++++++++++++++++++++++ include/asm-x86/setup.h | 1 + 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 1e67037..220d92f 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -249,7 +249,7 @@ static void __init reserve_initrd(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -static void __init post_reserve_initrd(void) +void __init post_reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; @@ -307,29 +307,11 @@ static void __init post_reserve_initrd(void) void __init reserve_initrd(void) { } -static void __init post_reserve_initrd(void) +void __init post_reserve_initrd(void) { } #endif /* CONFIG_BLK_DEV_INITRD */ -/* - * The node 0 pgdat is initialized before all of these because - * it's needed for bootmem. node>0 pgdats have their virtual - * space allocated before the pagetables are in place to access - * them, so they can't be cleared then. - * - * This should all compile down to nothing when NUMA is off. - */ -static void __init remapped_pgdat_init(void) -{ - int nid; - - for_each_online_node(nid) { - if (nid != 0) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - } -} - #ifdef CONFIG_MCA static void set_mca_bus(int x) { @@ -524,18 +506,6 @@ void __init setup_arch(char **cmdline_p) init_ohci1394_dma_on_all_controllers(); #endif - /* - * NOTE: at this point the bootmem allocator is fully available. - */ - - post_reserve_initrd(); - - remapped_pgdat_init(); - sparse_init(); - zone_sizes_init(); - - paravirt_post_allocator_init(); - #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9bb35cf..20ca295 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -705,6 +705,23 @@ void __init setup_bootmem_allocator(void) } +/* + * The node 0 pgdat is initialized before all of these because + * it's needed for bootmem. node>0 pgdats have their virtual + * space allocated before the pagetables are in place to access + * them, so they can't be cleared then. + * + * This should all compile down to nothing when NUMA is off. + */ +static void __init remapped_pgdat_init(void) +{ + int nid; + + for_each_online_node(nid) { + if (nid != 0) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + } +} /* * paging_init() sets up the page tables - note that the first 8MB are @@ -727,6 +744,18 @@ void __init paging_init(void) __flush_tlb_all(); kmap_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + + post_reserve_initrd(); + + remapped_pgdat_init(); + sparse_init(); + zone_sizes_init(); + + paravirt_post_allocator_init(); } /* diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index bb12a16..4ebb4ef 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -39,6 +39,7 @@ void reserve_crashkernel(void); #include <asm/bootparam.h> void reserve_standard_io_resources(void); +extern void post_reserve_initrd(void); #ifndef _SETUP -- cgit v1.1 From 7465252ea0121c9cd28be68dfb86293a7554a111 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 19:53:33 -0700 Subject: x86: setup_arch 32bit move efi check later Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 220d92f..52f4e01 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -336,14 +336,6 @@ void __init setup_arch(char **cmdline_p) early_ioremap_init(); reserve_setup_data(); -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) { - efi_enabled = 1; - efi_reserve_early(); - } -#endif - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; @@ -363,10 +355,17 @@ void __init setup_arch(char **cmdline_p) rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) { + efi_enabled = 1; + efi_reserve_early(); + } +#endif + ARCH_SETUP setup_memory_map(); - copy_edd(); if (!boot_params.hdr.root_flags) -- cgit v1.1 From 9a2e59302668b9ac2fb2a2c9bca1fc793c5d0409 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 19:54:23 -0700 Subject: x86: setup_arch 32bit move command line copying early Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 52f4e01..243b2f7 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -382,6 +382,9 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + parse_setup_data(); parse_early_param(); @@ -402,9 +405,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - if (efi_enabled) efi_init(); -- cgit v1.1 From 295deae401fc5b6f215e876d93b40f25cb968c88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 23 Jun 2008 19:55:05 -0700 Subject: x86: setup_arch 32bit move kvm_guest_init later Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 243b2f7..bba8d57 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -482,8 +482,6 @@ void __init setup_arch(char **cmdline_p) */ vmi_init(); #endif - kvm_guest_init(); - /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. Although the @@ -511,9 +509,15 @@ void __init setup_arch(char **cmdline_p) early_quirks(); + /* + * Read APIC and some other early information from ACPI tables. + */ acpi_boot_init(); #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + /* + * get boot-time SMP configuration: + */ if (smp_found_config) get_smp_config(); #endif @@ -523,6 +527,7 @@ void __init setup_arch(char **cmdline_p) "CONFIG_X86_PC cannot handle it.\nUse " "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); #endif + kvm_guest_init(); e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -- cgit v1.1 From 157fabf09594ab064b7ae92c81942af4b94663cb Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Sun, 22 Jun 2008 07:21:57 -0700 Subject: x86 boot: e820 code indentation fix Fix indentation. An earlier code merge got the indentation of four lines of code off by a tab. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e285ea3..8a8afbd 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -207,10 +207,10 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, struct e820entry *pbios; /* pointer to original bios entry */ unsigned long long addr; /* address for this change point */ }; -static struct change_member change_point_list[2*E820_X_MAX] __initdata; -static struct change_member *change_point[2*E820_X_MAX] __initdata; -static struct e820entry *overlap_list[E820_X_MAX] __initdata; -static struct e820entry new_bios[E820_X_MAX] __initdata; + static struct change_member change_point_list[2*E820_X_MAX] __initdata; + static struct change_member *change_point[2*E820_X_MAX] __initdata; + static struct e820entry *overlap_list[E820_X_MAX] __initdata; + static struct e820entry new_bios[E820_X_MAX] __initdata; struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; -- cgit v1.1 From 05486fa7e631a3be31a0bbc5a575a389a1609e94 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Sun, 22 Jun 2008 07:22:02 -0700 Subject: x86 boot: x86_64 efi compiler warning fix Fix a compiler warning. Rather than always casting a u32 to a pointer (which generates a warning on x86_64 systems) instead separate the x86_32 and x86_64 assignments entirely with ifdefs. Until other recent changes to this code, it used to have x86_64 separated like this. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/efi.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 473c89f..a03ca36 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -242,9 +242,11 @@ void __init efi_reserve_early(void) { unsigned long pmap; +#ifdef CONFIG_X86_32 pmap = boot_params.efi_info.efi_memmap; -#ifdef CONFIG_X86_64 - pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32; +#else + pmap = (boot_params.efi_info.efi_memmap | + ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); #endif memmap.phys_map = (void *)pmap; memmap.nr_map = boot_params.efi_info.efi_memmap_size / @@ -284,10 +286,12 @@ void __init efi_init(void) int i = 0; void *tmp; +#ifdef CONFIG_X86_32 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; -#ifdef CONFIG_X86_64 - efi_phys.systab = (void *)efi_phys.systab + - ((__u64)boot_params.efi_info.efi_systab_hi<<32); +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); #endif efi.systab = early_ioremap((unsigned long)efi_phys.systab, -- cgit v1.1 From c4ba1320b7075e9ce33ad0afaef43ba13260b4c2 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Sun, 22 Jun 2008 07:22:07 -0700 Subject: x86 boot: allow overlapping early reserve memory ranges Add support for overlapping early memory reservations. In general, they still can't overlap, and will panic with "Overlapping early reservations" if they do overlap. But if a memory range is reserved with the new call: reserve_early_overlap_ok() rather than with the usual call: reserve_early() then subsequent early reservations are allowed to overlap. This new reserve_early_overlap_ok() call is only used in one place so far, which is the "BIOS reserved" reservation for the the EBDA region, which out of Paranoia reserves more than what the BIOS might have specified, and which thus might overlap with another legitimate early memory reservation (such as, perhaps, the EFI memmap.) Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 140 +++++++++++++++++++++++++++++++++++++++++++++---- arch/x86/kernel/head.c | 2 +- include/asm-x86/e820.h | 1 + 3 files changed, 133 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 8a8afbd..600c9de 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -604,6 +604,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) struct early_res { u64 start, end; char name[16]; + char overlap_ok; }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ @@ -640,7 +641,93 @@ static int __init find_overlapped_early(u64 start, u64 end) return i; } -void __init reserve_early(u64 start, u64 end, char *name) +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[16]; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) { int i; struct early_res *r; @@ -656,14 +743,55 @@ void __init reserve_early(u64 start, u64 end, char *name) r->end - 1, r->name); r->start = start; r->end = end; + r->overlap_ok = overlap_ok; if (name) strncpy(r->name, name, sizeof(r->name) - 1); } +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + void __init free_early(u64 start, u64 end) { struct early_res *r; - int i, j; + int i; i = find_overlapped_early(start, end); r = &early_res[i]; @@ -671,13 +799,7 @@ void __init free_early(u64 start, u64 end) panic("free_early on not reserved area: %llx-%llx!", start, end - 1); - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; + drop_range(i); } void __init early_res_to_bootmem(u64 start, u64 end) diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index a727c0b..a6816be 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -51,7 +51,7 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - reserve_early(lowmem, 0x100000, "BIOS reserved"); + reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); } void __init reserve_setup_data(void) diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 7c32df0..13fa5a0 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -88,6 +88,7 @@ extern unsigned long end_user_pfn; extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); +extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); -- cgit v1.1 From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Sun, 22 Jun 2008 07:22:12 -0700 Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Page frame numbers (the portion of physical addresses above the low order page offsets) are displayed in several kernel debug and info prints in decimal, not hex. Decimal addresse are unreadable. Use hex. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 2 +- mm/page_alloc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 600c9de..512f779 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -990,7 +990,7 @@ unsigned long __init e820_end_of_ram(void) if (last_pfn > end_user_pfn) last_pfn = end_user_pfn; - printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n", + printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n", last_pfn, max_arch_pfn); return last_pfn; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e25b6b2..c09c2c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %8lu -> %8lu\n", + printk(" %-8s 0x%8lx -> 0x%8lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.1 From 2bc0d2615a15a93d344abbe8cb1b9056122bce9d Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Sun, 22 Jun 2008 07:22:17 -0700 Subject: x86 boot: more consistently use type int for node ids Everywhere I look, node id's are of type 'int', except in this one case, which has 'unsigned long'. Change this one to 'int' as well. There is nothing special about the way this variable 'nid' is used in this routine to justify using an unusual type here. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c09c2c8..b604c64 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3669,7 +3669,7 @@ static void __init sort_node_map(void) } /* Find the lowest pfn for a node */ -unsigned long __init find_min_pfn_for_node(unsigned long nid) +unsigned long __init find_min_pfn_for_node(int nid) { int i; unsigned long min_pfn = ULONG_MAX; @@ -3680,7 +3680,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) if (min_pfn == ULONG_MAX) { printk(KERN_WARNING - "Could not find start_pfn for node %lu\n", nid); + "Could not find start_pfn for node %d\n", nid); return 0; } -- cgit v1.1 From 47a486cc110fe77518c79a566b50a5c785c813ae Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Jun 2008 22:52:03 +0200 Subject: x86: perfctr-watchdog.c - coding style cleanup Just some code beautification. Nothing else. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/perfctr-watchdog.c | 202 ++++++++++++++++++--------------- 1 file changed, 112 insertions(+), 90 deletions(-) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index ddda4b6..2e9bef6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -1,11 +1,15 @@ -/* local apic based NMI watchdog for various CPUs. - This file also handles reservation of performance counters for coordination - with other users (like oprofile). - - Note that these events normally don't tick when the CPU idles. This means - the frequency varies with CPU load. - - Original code for K7/P6 written by Keith Owens */ +/* + * local apic based NMI watchdog for various CPUs. + * + * This file also handles reservation of performance counters for coordination + * with other users (like oprofile). + * + * Note that these events normally don't tick when the CPU idles. This means + * the frequency varies with CPU load. + * + * Original code for K7/P6 written by Keith Owens + * + */ #include <linux/percpu.h> #include <linux/module.h> @@ -36,12 +40,16 @@ struct wd_ops { static const struct wd_ops *wd_ops; -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) +/* + * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. + * + * It will be the max for all platforms (for now) */ #define NMI_MAX_COUNTER_BITS 66 -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: +/* + * perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection * - different performance counters/ event selection may be reserved for * different subsystems this reservation system just tries to coordinate @@ -73,8 +81,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) return 0; } -/* converts an msr to an appropriate reservation bit */ -/* returns the bit offset of the event selection register */ +/* + * converts an msr to an appropriate reservation bit + * returns the bit offset of the event selection register + */ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) { /* returns the bit offset of the event selection register */ @@ -114,6 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) return (!test_bit(counter, perfctr_nmi_owner)); } +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) { @@ -128,6 +139,7 @@ int reserve_perfctr_nmi(unsigned int msr) return 1; return 0; } +EXPORT_SYMBOL(reserve_perfctr_nmi); void release_perfctr_nmi(unsigned int msr) { @@ -140,6 +152,7 @@ void release_perfctr_nmi(unsigned int msr) clear_bit(counter, perfctr_nmi_owner); } +EXPORT_SYMBOL(release_perfctr_nmi); int reserve_evntsel_nmi(unsigned int msr) { @@ -154,6 +167,7 @@ int reserve_evntsel_nmi(unsigned int msr) return 1; return 0; } +EXPORT_SYMBOL(reserve_evntsel_nmi); void release_evntsel_nmi(unsigned int msr) { @@ -166,11 +180,6 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } - -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); void disable_lapic_nmi_watchdog(void) @@ -234,8 +243,8 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) return retval; } -static void -write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) +static void write_watchdog_counter(unsigned int perfctr_msr, + const char *descr, unsigned nmi_hz) { u64 count = (u64)cpu_khz * 1000; @@ -246,7 +255,7 @@ write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi } static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) + const char *descr, unsigned nmi_hz) { u64 count = (u64)cpu_khz * 1000; @@ -256,9 +265,10 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, wrmsr(perfctr_msr, (u32)(-count), 0); } -/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface - nicely stable so there is not much variety */ - +/* + * AMD K7/K8/Family10h/Family11h support. + * AMD keeps this interface nicely stable so there is not much variety + */ #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) @@ -291,7 +301,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused + wd->cccr_msr = 0; /* unused */ return 1; } @@ -327,18 +337,19 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) } static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<47, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_k7_watchdog, + .rearm = single_msr_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_K7_PERFCTR0, + .evntsel = MSR_K7_EVNTSEL0, + .checkbit = 1ULL << 47, }; -/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ - +/* + * Intel Model 6 (PPro+,P2,P3,P-M,Core1) + */ #define P6_EVNTSEL0_ENABLE (1 << 22) #define P6_EVNTSEL_INT (1 << 20) #define P6_EVNTSEL_OS (1 << 17) @@ -374,52 +385,58 @@ static int setup_p6_watchdog(unsigned nmi_hz) wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused + wd->cccr_msr = 0; /* unused */ return 1; } static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { - /* P6 based Pentium M need to re-unmask + /* + * P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant. - * ArchPerfom/Core Duo also needs this */ + * ArchPerfom/Core Duo also needs this + */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); } static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL<<39, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_p6_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_P6_PERFCTR0, + .evntsel = MSR_P6_EVNTSEL0, + .checkbit = 1ULL << 39, }; -/* Intel P4 performance counters. By far the most complicated of all. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) - -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ +/* + * Intel P4 performance counters. + * By far the most complicated of all. + */ +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) +#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) +#define P4_ESCR_OS (1 << 3) +#define P4_ESCR_USR (1 << 2) +#define P4_CCCR_OVF_PMI0 (1 << 26) +#define P4_CCCR_OVF_PMI1 (1 << 27) +#define P4_CCCR_THRESHOLD(N) ((N) << 20) +#define P4_CCCR_COMPLEMENT (1 << 19) +#define P4_CCCR_COMPARE (1 << 18) +#define P4_CCCR_REQUIRED (3 << 16) +#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) +#define P4_CCCR_ENABLE (1 << 12) +#define P4_CCCR_OVF (1 << 31) +/* + * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + * CRU_ESCR0 (with any non-null event selector) through a complemented + * max threshold. [IA32-Vol3, Section 14.9.9] + */ static int setup_p4_watchdog(unsigned nmi_hz) { unsigned int perfctr_msr, evntsel_msr, cccr_msr; @@ -444,7 +461,8 @@ static int setup_p4_watchdog(unsigned nmi_hz) #endif ht_num = 0; - /* performance counters are shared resources + /* + * performance counters are shared resources * assign each hyperthread its own set * (re-use the ESCR0 register, seems safe * and keeps the cccr_val the same) @@ -542,20 +560,21 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) } static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, + .reserve = p4_reserve, + .unreserve = p4_unreserve, + .setup = setup_p4_watchdog, + .rearm = p4_rearm, + .stop = stop_p4_watchdog, /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL<<39, + .perfctr = MSR_P4_BPU_PERFCTR0, + .evntsel = MSR_P4_BSU_ESCR0, + .checkbit = 1ULL << 39, }; -/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully - all future Intel CPUs. */ - +/* + * Watchdog using the Intel architected PerfMon. + * Used for Core2 and hopefully all future Intel CPUs. + */ #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK @@ -601,19 +620,19 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused + wd->cccr_msr = 0; /* unused */ intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_intel_arch_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) @@ -626,8 +645,10 @@ static void probe_nmi_watchdog(void) wd_ops = &k7_wd_ops; break; case X86_VENDOR_INTEL: - /* Work around Core Duo (Yonah) errata AE49 where perfctr1 - doesn't have a working enable bit. */ + /* + * Work around Core Duo (Yonah) errata AE49 where perfctr1 + * doesn't have a working enable bit. + */ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; @@ -638,7 +659,7 @@ static void probe_nmi_watchdog(void) } switch (boot_cpu_data.x86) { case 6: - if (boot_cpu_data.x86_model > 0xd) + if (boot_cpu_data.x86_model > 13) return; wd_ops = &p6_wd_ops; @@ -699,10 +720,11 @@ int lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; + rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) { /* perfctr still running? */ + if (ctr & wd_ops->checkbit) /* perfctr still running? */ return 0; - } + wd_ops->rearm(wd, nmi_hz); return 1; } -- cgit v1.1 From 116f570e5d8347bee2614da2361eeadbb639ea50 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Jun 2008 22:52:04 +0200 Subject: x86: nmi_watchdog - use nmi_watchdog variable for printing Since it is possible NMI_ definitions could be changed one day we better print out real nmi_watchdog value instead of constant string. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4932d78..b5571fb 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -550,7 +550,7 @@ void __init setup_boot_APIC_clock(void) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; else printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=1!\n"); + " due to nmi_watchdog=%d!\n", nmi_watchdog); } /* Setup the lapic or request the broadcast */ diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5279dc9..8c751f7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -417,7 +417,7 @@ void __init setup_boot_APIC_clock(void) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; else printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=1!\n"); + " due to nmi_watchdog=%d!\n", nmi_watchdog); setup_APIC_timer(); } -- cgit v1.1 From 2b6addad2d67a2d75ae10a1c8efd18d81d78ff82 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Jun 2008 22:52:04 +0200 Subject: x86: nmi_watchdog - remove useless check Since nmi_watchdog is unsigned variable we may safely remove the check for negative value. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9ebf713..427c511 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -187,7 +187,7 @@ error: static int __init setup_nmi_watchdog(char *str) { - int nmi; + unsigned int nmi; if (!strncmp(str, "panic", 5)) { panic_on_timeout = 1; @@ -199,7 +199,7 @@ static int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID || nmi < NMI_NONE) + if (nmi >= NMI_INVALID) return 0; nmi_watchdog = nmi; -- cgit v1.1 From c376d45432d935e6f1e0ff2d6be3734bcd3ba455 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Jun 2008 22:52:05 +0200 Subject: x86: nmi_watchdog - use NMI_NONE by default There is no need to keep NMI_DISABLED definition and use it for nmi_watchdog by default. Here is the point why: - IO-APIC and APIC chips are programmed for nmi_watchdog support at very early stage of kernel booting and not having nmi_watchdog specified as boot option lead only to nmi_watchdog becomes to NMI_NONE anyway - enable nmi_watchdog thru /proc/sys/kernel/nmi if it was not specified at boot is not possible too (even having this sysfs entry) Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_64.c | 1 - arch/x86/kernel/io_apic_64.c | 2 -- arch/x86/kernel/nmi.c | 26 +++----------------------- arch/x86/kernel/smpboot.c | 1 - include/asm-x86/nmi.h | 3 --- 5 files changed, 3 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 8c751f7..1e3d32e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -826,7 +826,6 @@ static void __cpuinit lapic_setup_esr(void) void __cpuinit end_local_APIC_setup(void) { lapic_setup_esr(); - nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 08c4875..2b4c40b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1729,7 +1729,6 @@ static inline void __init check_timer(void) } unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); enable_8259A_irq(0); @@ -1758,7 +1757,6 @@ static inline void __init check_timer(void) if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); timer_through_8259 = 1; - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 427c511..32acda2 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -52,7 +52,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -unsigned int nmi_watchdog = NMI_DEFAULT; +unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); static int panic_on_timeout; @@ -92,14 +92,6 @@ static inline unsigned int get_timer_irqs(int cpu) #endif } -/* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - nmi_watchdog = NMI_NONE; -} - #ifdef CONFIG_SMP /* * The performance counters used by NMI_LOCAL_APIC don't trigger when @@ -127,7 +119,7 @@ int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; - if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED) + if (nmi_watchdog == NMI_NONE) return 0; if (!atomic_read(&nmi_active)) @@ -482,24 +474,12 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (!!old_state == !!nmi_watchdog_enabled) return 0; - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_NONE) { printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); return -EIO; } - /* if nmi_watchdog is not set yet, then set it */ - nmi_watchdog_default(); - -#ifdef CONFIG_X86_32 - if (nmi_watchdog == NMI_NONE) { - if (lapic_watchdog_ok()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; - } -#endif - if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3b48d1f..3b19441 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1139,7 +1139,6 @@ static void __init smp_cpu_index_default(void) void __init native_smp_prepare_cpus(unsigned int max_cpus) { preempt_disable(); - nmi_watchdog_default(); smp_cpu_index_default(); current_cpu_data = boot_cpu_data; cpu_callin_map = cpumask_of_cpu(0); diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index f0e435d..1348e54 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -20,7 +20,6 @@ extern void default_do_nmi(struct pt_regs *); #endif extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); -extern void nmi_watchdog_default(void); extern int check_nmi_watchdog(void); extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); @@ -38,12 +37,10 @@ extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); extern atomic_t nmi_active; extern unsigned int nmi_watchdog; -#define NMI_DISABLED -1 #define NMI_NONE 0 #define NMI_IO_APIC 1 #define NMI_LOCAL_APIC 2 #define NMI_INVALID 3 -#define NMI_DEFAULT NMI_DISABLED struct ctl_table; struct file; -- cgit v1.1 From 4de0043617f949fdac538fd59335e2150cd1b863 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 24 Jun 2008 22:52:06 +0200 Subject: x86: nmi_watchdog - introduce nmi_watchdog_active() helper Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/nmi.c | 13 ++++--------- include/asm-x86/nmi.h | 13 +++++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 32acda2..8dfe9db 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -119,10 +119,7 @@ int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; - if (nmi_watchdog == NMI_NONE) - return 0; - - if (!atomic_read(&nmi_active)) + if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) return 0; prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); @@ -317,8 +314,7 @@ void setup_apic_nmi_watchdog(void *unused) void stop_apic_nmi_watchdog(void *unused) { /* only support LOCAL and IO APICs for now */ - if (nmi_watchdog != NMI_LOCAL_APIC && - nmi_watchdog != NMI_IO_APIC) + if (!nmi_watchdog_active()) return; if (__get_cpu_var(wd_enabled) == 0) return; @@ -348,8 +344,7 @@ static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) { - if (nmi_watchdog == NMI_LOCAL_APIC || - nmi_watchdog == NMI_IO_APIC) { + if (nmi_watchdog_active()) { unsigned cpu; /* @@ -474,7 +469,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (!!old_state == !!nmi_watchdog_enabled) return 0; - if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_NONE) { + if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); return -EIO; diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h index 1348e54..21f8d02 100644 --- a/include/asm-x86/nmi.h +++ b/include/asm-x86/nmi.h @@ -56,6 +56,19 @@ static inline void localise_nmi_watchdog(void) if (nmi_watchdog == NMI_IO_APIC) nmi_watchdog = NMI_LOCAL_APIC; } + +/* check if nmi_watchdog is active (ie was specified at boot) */ +static inline int nmi_watchdog_active(void) +{ + /* + * actually it should be: + * return (nmi_watchdog == NMI_LOCAL_APIC || + * nmi_watchdog == NMI_IO_APIC) + * but since they are power of two we could use a + * cheaper way --cvg + */ + return nmi_watchdog & 0x3; +} #endif void lapic_watchdog_stop(void); -- cgit v1.1 From bea41808efdd8815435376209f23f406f8bf435f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:18:57 -0400 Subject: x86: asm-x86/pgtable.h: fix compiler warning Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index f048975..1330e74 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -425,6 +425,8 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, * race with other CPU's that might be updating the dirty * bit at the same time. */ +struct vm_area_struct; + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, -- cgit v1.1 From d338c73c39a6ed0d07fe3bb07c7f12fff0dd237d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:18:58 -0400 Subject: x86: add memory clobber to save/loadsegment Add "memory" clobbers to savesegment and loadsegment, since they can affect memory accesses and we never want the compiler to reorder them with respect to memory references. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/system.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index bacfcee..f686aa6 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -153,14 +153,14 @@ extern void load_gs_index(unsigned); "jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b,3b) \ - : :"r" (value), "r" (0)) + : :"r" (value), "r" (0) : "memory") /* * Save a segment register away */ #define savesegment(seg, value) \ - asm volatile("mov %%" #seg ",%0":"=rm" (value)) + asm("mov %%" #seg ",%0":"=rm" (value) : : "memory") static inline unsigned long get_limit(unsigned long segment) { -- cgit v1.1 From af2b1c609ff52b6469d8e67696db98c93c348b0e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:18:59 -0400 Subject: x86: add memory barriers to wrmsr wrmsr is a special instruction which can have arbitrary system-wide effects. We don't want the compiler to reorder it with respect to memory operations, so make it a memory barrier. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/msr.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/msr.h b/include/asm-x86/msr.h index 2b5f2c9..ca110ee 100644 --- a/include/asm-x86/msr.h +++ b/include/asm-x86/msr.h @@ -66,7 +66,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, static inline void native_write_msr(unsigned int msr, unsigned low, unsigned high) { - asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high)); + asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } static inline int native_write_msr_safe(unsigned int msr, @@ -81,7 +81,8 @@ static inline int native_write_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : "=a" (err) : "c" (msr), "0" (low), "d" (high), - "i" (-EFAULT)); + "i" (-EFAULT) + : "memory"); return err; } -- cgit v1.1 From ada857082317e6883cfcf7deb4e0c54d3c447cb0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:00 -0400 Subject: x86: remove open-coded save/load segment operations This removes a pile of buggy open-coded implementations of savesegment and loadsegment. (They are buggy because they don't have memory barriers to prevent them from being reordered with respect to memory accesses.) Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/cpu/common_64.c | 3 ++- arch/x86/kernel/process_64.c | 28 +++++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 39eaefb..7518502 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -480,7 +480,8 @@ void pda_init(int cpu) struct x8664_pda *pda = cpu_pda(cpu); /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + loadsegment(fs, 0); + loadsegment(gs, 0); /* Memory clobbers used to order PDA accessed */ mb(); wrmsrl(MSR_GS_BASE, pda); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 290183e..ddc6fcc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -335,10 +335,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; - asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); - asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); - asm("mov %%es,%0" : "=m" (p->thread.es)); - asm("mov %%ds,%0" : "=m" (p->thread.ds)); + savesegment(gs, p->thread.gsindex); + savesegment(fs, p->thread.fsindex); + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -377,7 +377,9 @@ out: void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); + loadsegment(fs, 0); + loadsegment(es, 0); + loadsegment(ds, 0); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; @@ -550,11 +552,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch DS and ES. * This won't pick up thread selector changes, but I guess that is ok. */ - asm volatile("mov %%es,%0" : "=m" (prev->es)); + savesegment(es, prev->es); if (unlikely(next->es | prev->es)) loadsegment(es, next->es); - - asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); + + savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); @@ -565,7 +567,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ { unsigned fsindex; - asm volatile("movl %%fs,%0" : "=r" (fsindex)); + savesegment(fs, fsindex); /* segment register != 0 always requires a reload. also reload when it has changed. when prev process used 64bit base always reload @@ -586,7 +588,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } { unsigned gsindex; - asm volatile("movl %%gs,%0" : "=r" (gsindex)); + savesegment(gs, gsindex); if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); if (gsindex) @@ -767,7 +769,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) set_32bit_tls(task, FS_TLS, addr); if (doit) { load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); + loadsegment(fs, FS_TLS_SEL); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; @@ -777,7 +779,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (doit) { /* set the selector to 0 to not confuse __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); + loadsegment(fs, 0); ret = checking_wrmsrl(MSR_FS_BASE, addr); } } @@ -800,7 +802,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + savesegment(gs, gsindex); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else -- cgit v1.1 From fc8b8a60ffa7c89da58c75109dacf0b2798c7caf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:01 -0400 Subject: x86, 64-bit: use write_gdt_entry in vsyscall_set_cpu Use write_gdt_entry to generate the special vgetcpu descriptor in the vsyscall page. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/vsyscall_64.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f..c87cbd8 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -249,7 +249,7 @@ static ctl_table kernel_root_table2[] = { doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) { - unsigned long *d; + unsigned long d; unsigned long node = 0; #ifdef CONFIG_NUMA node = cpu_to_node(cpu); @@ -260,11 +260,11 @@ static void __cpuinit vsyscall_set_cpu(int cpu) /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); - *d = 0x0f40000000000ULL; - *d |= cpu; - *d |= (node & 0xf) << 12; - *d |= (node >> 4) << 48; + d = 0x0f40000000000ULL; + d |= cpu; + d |= (node & 0xf) << 12; + d |= (node >> 4) << 48; + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } static void __cpuinit cpu_vsyscall_init(void *arg) -- cgit v1.1 From bb23e403e5162765dabe3dc78646724753d6359b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:02 -0400 Subject: x86, 64-bit: use p??_populate() to attach pages to pagetable Use the _populate() functions to attach new pages to a pagetable, to make sure the right paravirt_ops calls get called. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 54a9840..bfea760 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -167,7 +167,7 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) { printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0)); @@ -177,7 +177,7 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) pmd = pmd_offset(pud, vaddr); if (pmd_none(*pmd)) { pte = (pte_t *) spp_getpage(); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) { printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; @@ -389,7 +389,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); + pud_populate(&init_mm, pud, __va(pmd_phys)); last_map_addr = phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); @@ -592,7 +592,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon next = end; last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + pgd_populate(&init_mm, pgd_offset_k(start), + __va(pud_phys)); unmap_low_page(pud); } -- cgit v1.1 From 4583ed514ea9ac844a6eb02d33120beaedf6837f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:03 -0400 Subject: x86, 64-bit: unify early_ioremap The 32-bit early_ioremap will work equally well for 64-bit, so just use it. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 52 --------------------------------------------- arch/x86/mm/ioremap.c | 5 +---- include/asm-x86/fixmap_64.h | 13 ++++++++++++ include/asm-x86/io.h | 13 ++++++++++++ include/asm-x86/io_32.h | 12 ----------- 5 files changed, 27 insertions(+), 68 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bfea760..9b60491 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -257,58 +257,6 @@ static __meminit void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } -/* Must run before zap_low_mappings */ -__meminit void *early_ioremap(unsigned long addr, unsigned long size) -{ - pmd_t *pmd, *last_pmd; - unsigned long vaddr; - int i, pmds; - - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - vaddr = __START_KERNEL_map; - pmd = level2_kernel_pgt; - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; - - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { - for (i = 0; i < pmds; i++) { - if (pmd_present(pmd[i])) - goto continue_outer_loop; - } - vaddr += addr & ~PMD_MASK; - addr &= PMD_MASK; - - for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - __flush_tlb_all(); - - return (void *)vaddr; -continue_outer_loop: - ; - } - printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); - - return NULL; -} - -/* - * To avoid virtual aliases later: - */ -__meminit void early_iounmap(void *addr, unsigned long size) -{ - unsigned long vaddr; - pmd_t *pmd; - int i, pmds; - - vaddr = (unsigned long)addr; - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - pmd = level2_kernel_pgt + pmd_index(vaddr); - - for (i = 0; i < pmds; i++) - pmd_clear(pmd + i); - - __flush_tlb_all(); -} - static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0561fde..092b3d7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -381,8 +381,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -#ifdef CONFIG_X86_32 - int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) @@ -483,6 +481,7 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, return; } pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else @@ -624,5 +623,3 @@ void __this_fixmap_does_not_exist(void) { WARN_ON(1); } - -#endif /* CONFIG_X86_32 */ diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index 6260988..c7dcc36 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -49,6 +49,19 @@ enum fixed_addresses { #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif + __end_of_permanent_fixed_addresses, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * We round it up to the next 512 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_NESTING 4 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 512 - + (__end_of_permanent_fixed_addresses & 511), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, __end_of_fixed_addresses }; diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index 8e9eca9..c63563d 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -72,4 +72,17 @@ extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, unsigned long prot_val); extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. + */ +extern void early_ioremap_init(void); +extern void early_ioremap_clear(void); +extern void early_ioremap_reset(void); +extern void *early_ioremap(unsigned long offset, unsigned long size); +extern void early_iounmap(void *addr, unsigned long size); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + + #endif /* _ASM_X86_IO_H */ diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index d71be8d..4df44ed 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -122,18 +122,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); /* - * early_ioremap() and early_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void early_ioremap_init(void); -extern void early_ioremap_clear(void); -extern void early_ioremap_reset(void); -extern void *early_ioremap(unsigned long offset, unsigned long size); -extern void early_iounmap(void *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); - -/* * ISA I/O bus memory addresses are 1:1 with the physical address. */ #define isa_virt_to_bus virt_to_phys -- cgit v1.1 From 43adfc26dea171558f944adbc9adecddf2d4602f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:04 -0400 Subject: x86, 64-bit: add gate_offset() and gate_segment() macros For calculating the offset from struct gate_struct fields. [ gate_offset and gate_segment were broken for 32-bit. ] Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/desc_defs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-x86/desc_defs.h b/include/asm-x86/desc_defs.h index eccb4ea..f7bacf3 100644 --- a/include/asm-x86/desc_defs.h +++ b/include/asm-x86/desc_defs.h @@ -75,10 +75,14 @@ struct ldttss_desc64 { typedef struct gate_struct64 gate_desc; typedef struct ldttss_desc64 ldt_desc; typedef struct ldttss_desc64 tss_desc; +#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) +#define gate_segment(g) ((g).segment) #else typedef struct desc_struct gate_desc; typedef struct desc_struct ldt_desc; typedef struct desc_struct tss_desc; +#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) +#define gate_segment(g) ((g).a >> 16) #endif struct desc_ptr { -- cgit v1.1 From e7a9b0b3c32aa13f4c766eb6a4e7038260718d4c Mon Sep 17 00:00:00 2001 From: Eduardo Habkost <ehabkost@redhat.com> Date: Wed, 25 Jun 2008 00:19:05 -0400 Subject: x86, 64-bit: use __pgd() on mk_kernel_pgd() Use __pgd() on mk_kernel_pgd() Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/pgtable_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 1cc50d2..23f35ff 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -195,7 +195,7 @@ static inline int pmd_bad(pmd_t pmd) #define pgd_offset_k(address) (init_level4_pgt + pgd_index((address))) #define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) static inline int pgd_large(pgd_t pgd) { return 0; } -#define mk_kernel_pgd(address) ((pgd_t){ (address) | _KERNPG_TABLE }) +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ /* to find an entry in a page-table-directory. */ -- cgit v1.1 From fb15a9b3047a245a30a51696e4d8e29b1175a598 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:06 -0400 Subject: x86: unify pgd_index pgd_index is common for 32 and 64-bit, so move it to a common place. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/pgtable.h | 20 ++++++++++++++++++++ include/asm-x86/pgtable_32.h | 20 -------------------- include/asm-x86/pgtable_64.h | 3 --- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 1330e74..85f851a 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -357,6 +357,26 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); # include "pgtable_64.h" #endif +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 32ca031..ec871c4 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -113,26 +113,6 @@ extern unsigned long pg0[]; */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -/* - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] - * - * this macro returns the index of the entry in the pgd page which would - * control the given virtual address - */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_index_k(addr) pgd_index((addr)) - -/* - * pgd_offset() returns a (pgd_t *) - * pgd_index() is used get the offset into the pgd page's array of pgd_t's; - */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) - -/* - * a shortcut which implies the use of the kernel's pgd, instead - * of a process's - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) static inline int pud_large(pud_t pud) { return 0; } diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 23f35ff..9a9350a 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -190,9 +190,6 @@ static inline int pmd_bad(pmd_t pmd) #define pgd_page_vaddr(pgd) \ ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_MASK)) #define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT)) -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) -#define pgd_offset_k(address) (init_level4_pgt + pgd_index((address))) #define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) -- cgit v1.1 From c3c2fee38462fa34b90e0a5427c7fc564bb5c96c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:07 -0400 Subject: x86: unify mmu_context.h Some amount of asm-x86/mmu_context.h can be unified, including activate_mm paravirt hook. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mmu_context.h | 32 ++++++++++++++++++++++++++++++++ include/asm-x86/mmu_context_32.h | 28 ---------------------------- include/asm-x86/mmu_context_64.h | 18 ------------------ 3 files changed, 32 insertions(+), 46 deletions(-) diff --git a/include/asm-x86/mmu_context.h b/include/asm-x86/mmu_context.h index 6598450..fac5701 100644 --- a/include/asm-x86/mmu_context.h +++ b/include/asm-x86/mmu_context.h @@ -1,5 +1,37 @@ +#ifndef __ASM_X86_MMU_CONTEXT_H +#define __ASM_X86_MMU_CONTEXT_H + +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/paravirt.h> +#ifndef CONFIG_PARAVIRT +#include <asm-generic/mm_hooks.h> + +static inline void paravirt_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ +} +#endif /* !CONFIG_PARAVIRT */ + +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + #ifdef CONFIG_X86_32 # include "mmu_context_32.h" #else # include "mmu_context_64.h" #endif + +#define activate_mm(prev, next) \ +do { \ + paravirt_activate_mm((prev), (next)); \ + switch_mm((prev), (next), NULL); \ +} while (0); + + +#endif /* __ASM_X86_MMU_CONTEXT_H */ diff --git a/include/asm-x86/mmu_context_32.h b/include/asm-x86/mmu_context_32.h index 9756ae0..824fc57 100644 --- a/include/asm-x86/mmu_context_32.h +++ b/include/asm-x86/mmu_context_32.h @@ -1,28 +1,6 @@ #ifndef __I386_SCHED_H #define __I386_SCHED_H -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/paravirt.h> -#ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> - -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) -{ -} -#endif /* !CONFIG_PARAVIRT */ - - -/* - * Used for LDT copy/destruction. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP @@ -75,10 +53,4 @@ static inline void switch_mm(struct mm_struct *prev, #define deactivate_mm(tsk, mm) \ asm("movl %0,%%gs": :"r" (0)); -#define activate_mm(prev, next) \ -do { \ - paravirt_activate_mm((prev), (next)); \ - switch_mm((prev), (next), NULL); \ -} while (0); - #endif diff --git a/include/asm-x86/mmu_context_64.h b/include/asm-x86/mmu_context_64.h index ca44c71..c700063 100644 --- a/include/asm-x86/mmu_context_64.h +++ b/include/asm-x86/mmu_context_64.h @@ -1,21 +1,7 @@ #ifndef __X86_64_MMU_CONTEXT_H #define __X86_64_MMU_CONTEXT_H -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/pgalloc.h> #include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> -#endif - -/* - * possibly do the LDT unload here? - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { @@ -65,8 +51,4 @@ do { \ asm volatile("movl %0,%%fs"::"r"(0)); \ } while (0) -#define activate_mm(prev, next) \ - switch_mm((prev), (next), NULL) - - #endif -- cgit v1.1 From 4e29684c40f2a332ba4d05f6482d5807725d5624 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 12:18:14 -0700 Subject: x86: introduce init_memory_mapping for 32bit #1 ... so can we use mem below max_low_pfn earlier. this allows us to move several functions more early instead of waiting to after paging_init. That includes moving relocate_initrd() earlier in the bootup, and kva related early setup done in initmem_init. (in followup patches) Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 10 ++-- arch/x86/mm/init_32.c | 141 +++++++++++++++++++++++++++++++++++---------- include/asm-x86/page_32.h | 2 + 3 files changed, 120 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index bba8d57..03007ca 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -226,10 +226,8 @@ static void __init reserve_initrd(void) } /* We need to move the initrd down into lowmem */ - ramdisk_target = max_pfn_mapped<<PAGE_SHIFT; - ramdisk_here = find_e820_area(min(ramdisk_target, end_of_lowmem>>1), - end_of_lowmem, ramdisk_size, - PAGE_SIZE); + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + PAGE_SIZE); if (ramdisk_here == -1ULL) panic("Cannot find place for new RAMDISK of size %lld\n", @@ -433,8 +431,12 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820_end_of_ram(); } + /* max_low_pfn get updated here */ find_low_pfn_range(); + /* max_pfn_mapped is updated here */ + init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + reserve_initrd(); dmi_scan_machine(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 20ca295..619058e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -57,6 +57,27 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); + +static unsigned long __initdata table_start; +static unsigned long __meminitdata table_end; +static unsigned long __meminitdata table_top; + +static int __initdata after_init_bootmem; + +static __init void *alloc_low_page(unsigned long *phys) +{ + unsigned long pfn = table_end++; + void *adr; + + if (pfn >= table_top) + panic("alloc_low_page: ran out of memory"); + + adr = __va(pfn * PAGE_SIZE); + memset(adr, 0, PAGE_SIZE); + *phys = pfn * PAGE_SIZE; + return adr; +} + /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry @@ -68,9 +89,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE + unsigned long phys; if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - + if (after_init_bootmem) + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + else + pmd_table = (pmd_t *)alloc_low_page(&phys); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -92,12 +116,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; + if (after_init_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) { - page_table = + if (!page_table) + page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } else { + unsigned long phys; + page_table = (pte_t *)alloc_low_page(&phys); } paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); @@ -155,7 +183,9 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) +static void __init kernel_physical_mapping_init(pgd_t *pgd_base, + unsigned long start, + unsigned long end) { int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; @@ -163,18 +193,19 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd_t *pmd; pte_t *pte; unsigned pages_2m = 0, pages_4k = 0; + unsigned limit_pfn = end >> PAGE_SHIFT; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; - pfn = 0; + pfn = start >> PAGE_SHIFT; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) + if (pfn >= limit_pfn) continue; for (pmd_idx = 0; - pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd_idx < PTRS_PER_PMD && pfn < limit_pfn; pmd++, pmd_idx++) { unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; @@ -418,20 +449,7 @@ static void __init pagetable_init(void) paravirt_pagetable_setup_start(pgd_base); - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; - } - - kernel_physical_mapping_init(pgd_base); remap_numa_kva(); - /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -703,6 +721,7 @@ void __init setup_bootmem_allocator(void) free_bootmem_with_active_regions(i, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); + after_init_bootmem = 1; } /* @@ -723,6 +742,77 @@ static void __init remapped_pgdat_init(void) } } +static void __init find_early_table_space(unsigned long end) +{ + unsigned long puds, pmds, tables, start; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = PAGE_ALIGN(puds * sizeof(pud_t)); + + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, + tables, PAGE_SIZE); + if (table_start == -1UL) + panic("Cannot find space for the kernel page tables"); + + table_start >>= PAGE_SHIFT; + table_end = table_start; + table_top = table_start + (tables>>PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); +} + +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + pgd_t *pgd_base = swapper_pg_dir; + + /* + * Find space for the kernel direct mapping tables. + */ + if (!after_init_bootmem) + find_early_table_space(end); + +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + } + + kernel_physical_mapping_init(pgd_base, start, end); + + load_cr3(swapper_pg_dir); + + __flush_tlb_all(); + + if (!after_init_bootmem) + reserve_early(table_start << PAGE_SHIFT, + table_end << PAGE_SHIFT, "PGTABLE"); + + return end >> PAGE_SHIFT; +} + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. @@ -732,15 +822,8 @@ static void __init remapped_pgdat_init(void) */ void __init paging_init(void) { -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif pagetable_init(); - load_cr3(swapper_pg_dir); - __flush_tlb_all(); kmap_init(); diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 3810d14..4ae1dab 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -93,6 +93,8 @@ extern int sysctl_legacy_va_layout; #define MAXMEM (-__PAGE_OFFSET - __VMALLOC_RESERVE) extern void find_low_pfn_range(void); +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); extern void initmem_init(unsigned long, unsigned long); extern void zone_sizes_init(void); extern void setup_bootmem_allocator(void); -- cgit v1.1 From cfb0e53b05402f1ce65053677409a819c1798d34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 12:18:58 -0700 Subject: x86: introduce init_memory_mapping for 32bit #2 moving relocate_initrd early Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 22 +++++++--------------- arch/x86/mm/init_32.c | 3 --- include/asm-x86/setup.h | 1 - 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 03007ca..4b953dc 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -186,7 +186,7 @@ static inline void copy_edd(void) #ifdef CONFIG_BLK_DEV_INITRD -static bool do_relocate_initrd = false; +static void __init relocate_initrd(void); static void __init reserve_initrd(void) { @@ -195,7 +195,6 @@ static void __init reserve_initrd(void) u64 ramdisk_end = ramdisk_image + ramdisk_size; u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; u64 ramdisk_here; - u64 ramdisk_target; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -242,12 +241,12 @@ static void __init reserve_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", ramdisk_here, ramdisk_here + ramdisk_size); - do_relocate_initrd = true; + relocate_initrd(); } #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -void __init post_reserve_initrd(void) +static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; @@ -256,9 +255,6 @@ void __init post_reserve_initrd(void) unsigned long slop, clen, mapaddr; char *p, *q; - if (!do_relocate_initrd) - return; - ramdisk_here = initrd_start - PAGE_OFFSET; q = (char *)initrd_start; @@ -269,10 +265,6 @@ void __init post_reserve_initrd(void) p = (char *)__va(ramdisk_image); memcpy(q, p, clen); q += clen; - /* need to free these low pages...*/ - printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n", - ramdisk_image, ramdisk_image + clen - 1); - free_bootmem(ramdisk_image, clen); ramdisk_image += clen; ramdisk_size -= clen; } @@ -298,16 +290,16 @@ void __init post_reserve_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); - /* need to free that, otherwise init highmem will reserve it again */ + /* + * need to free old one, otherwise init cross max_low_pfn could be + * converted to bootmem + */ free_early(ramdisk_image, ramdisk_image+ramdisk_size); } #else void __init reserve_initrd(void) { } -void __init post_reserve_initrd(void) -{ -} #endif /* CONFIG_BLK_DEV_INITRD */ #ifdef CONFIG_MCA diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 619058e..ecccb05 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,9 +831,6 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. */ - - post_reserve_initrd(); - remapped_pgdat_init(); sparse_init(); zone_sizes_init(); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 4ebb4ef..bb12a16 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -39,7 +39,6 @@ void reserve_crashkernel(void); #include <asm/bootparam.h> void reserve_standard_io_resources(void); -extern void post_reserve_initrd(void); #ifndef _SETUP -- cgit v1.1 From 3a58a2a6c879b2e47daafd6e641661c50ac9da5a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 12:19:41 -0700 Subject: x86: introduce init_memory_mapping for 32bit #3 move kva related early backto initmem_init for numa32 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 8 ++++++-- arch/x86/mm/init_32.c | 20 -------------------- include/asm-x86/numa_32.h | 5 ----- 3 files changed, 6 insertions(+), 27 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index f5ae319..42484d4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -void __init remap_numa_kva(void) +static void __init remap_numa_kva(void) { void *vaddr; unsigned long pfn; @@ -373,12 +373,16 @@ void __init initmem_init(unsigned long start_pfn, allocate_pgdat(nid); } + remap_numa_kva(); + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) propagate_e820_map_node(nid); - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ecccb05..c059c46 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -449,7 +449,6 @@ static void __init pagetable_init(void) paravirt_pagetable_setup_start(pgd_base); - remap_numa_kva(); /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -724,24 +723,6 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -/* - * The node 0 pgdat is initialized before all of these because - * it's needed for bootmem. node>0 pgdats have their virtual - * space allocated before the pagetables are in place to access - * them, so they can't be cleared then. - * - * This should all compile down to nothing when NUMA is off. - */ -static void __init remapped_pgdat_init(void) -{ - int nid; - - for_each_online_node(nid) { - if (nid != 0) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - } -} - static void __init find_early_table_space(unsigned long end) { unsigned long puds, pmds, tables, start; @@ -831,7 +812,6 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. */ - remapped_pgdat_init(); sparse_init(); zone_sizes_init(); diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h index ed6c88e..220d7b7 100644 --- a/include/asm-x86/numa_32.h +++ b/include/asm-x86/numa_32.h @@ -5,12 +5,7 @@ extern int pxm_to_nid(int pxm); extern void numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); extern void set_highmem_pages_init(void); -#else -static inline void remap_numa_kva(void) -{ -} #endif #endif /* _ASM_X86_32_NUMA_H */ -- cgit v1.1 From 976dd4dc99c3eaf45e3802ed46e3cc06a1ad8689 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 14:55:32 -0700 Subject: x86: fix e820_update_range size when overlapping before that we relay on sanitize_e820_map to remove the overlap. but e820_update_range(,,E820_RESERVED, E820_RAM) will not work this patch fix that who is going to use this? Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 512f779..15b4393 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -425,6 +425,11 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, e820_add_region(final_start, final_end - final_start, new_type); real_updated_size += final_end - final_start; + + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; } return real_updated_size; } -- cgit v1.1 From 1a0db38e5fa50eeb885194b1f4e494d4de55b918 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 14:56:20 -0700 Subject: x86: get max_pfn_mapped in init_memory_mapping so don't shift that in the loop Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9b60491..359e3af 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -346,7 +346,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); - return last_map_addr >> PAGE_SHIFT; + return last_map_addr; } static void __init find_early_table_space(unsigned long end) @@ -556,7 +556,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon if (!after_bootmem) early_memtest(start_phys, end_phys); - return last_map_addr; + return last_map_addr >> PAGE_SHIFT; } #ifndef CONFIG_NUMA -- cgit v1.1 From d86623a0d55a14e7295e8ca1bd258e0c7f8dcb31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 14:57:29 -0700 Subject: x86: add table_top check for alloc_low_page in 64 bit that range is from find_e820_area, so don't try to use end_pfn to see if out of boundary...use table_top instead to avoid possible strange result while cross the boundary... also change early_printk to printk, because init_memory_mapping is after early param parsing, and console=uart8250 already working at that time. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 359e3af..6eced2f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -227,6 +227,7 @@ void __init cleanup_highmap(void) static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; +static unsigned long __meminitdata table_top; static __meminit void *alloc_low_page(unsigned long *phys) { @@ -240,7 +241,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= end_pfn) + if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); @@ -372,10 +373,10 @@ static void __init find_early_table_space(unsigned long end) table_start >>= PAGE_SHIFT; table_end = table_start; + table_top = table_start + (tables >> PAGE_SHIFT); - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); } static void __init init_gbpages(void) -- cgit v1.1 From 232b957ae93973a5f8619ef61b916744b747478c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 14:58:38 -0700 Subject: x86: change size if e820_update/remove_range in case someone using crazy parameter while calling them. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 15b4393..1b76b25 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -405,6 +405,9 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, BUG_ON(old_type == new_type); + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; @@ -441,6 +444,9 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int i; u64 real_removed_size = 0; + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; -- cgit v1.1 From f47f9d538ecc938bed589e9d39ad7b454f3b506c Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 22:13:15 -0700 Subject: x86: numa 32 using apicid_2_node to get node for logical_apicid Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/mach-bigsmp/mach_apic.h | 2 +- include/asm-x86/mach-default/mach_apic.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/mach-bigsmp/mach_apic.h b/include/asm-x86/mach-bigsmp/mach_apic.h index 8327907..017c8c1 100644 --- a/include/asm-x86/mach-bigsmp/mach_apic.h +++ b/include/asm-x86/mach-bigsmp/mach_apic.h @@ -81,7 +81,7 @@ static inline int multi_timer_check(int apic, int irq) static inline int apicid_to_node(int logical_apicid) { - return (0); + return apicid_2_node[hard_smp_processor_id()]; } static inline int cpu_present_to_apicid(int mps_cpu) diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h index 21003b5..0b2cde5 100644 --- a/include/asm-x86/mach-default/mach_apic.h +++ b/include/asm-x86/mach-default/mach_apic.h @@ -77,7 +77,11 @@ static inline void setup_apic_routing(void) static inline int apicid_to_node(int logical_apicid) { +#ifdef CONFIG_SMP + return apicid_2_node[hard_smp_processor_id()]; +#else return 0; +#endif } #endif -- cgit v1.1 From c987d12f8455b19b3b057d63bac3de161bd809fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 24 Jun 2008 22:14:09 -0700 Subject: x86: remove end_pfn in 64bit and use max_pfn directly. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/aperture_64.c | 6 ++++-- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/early-quirks.c | 2 +- arch/x86/kernel/machine_kexec_64.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 4 ++-- arch/x86/kernel/pci-dma.c | 4 ++-- arch/x86/kernel/pci-gart_64.c | 4 ++-- arch/x86/kernel/pci-swiotlb_64.c | 2 +- arch/x86/mm/init_64.c | 17 ++++++----------- arch/x86/mm/k8topology_64.c | 4 ++-- arch/x86/mm/numa_64.c | 4 ++-- arch/x86/mm/srat_64.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- include/asm-x86/page_64.h | 5 +++-- 14 files changed, 29 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 600470d..9f90780 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -407,7 +407,9 @@ void __init gart_iommu_hole_init(void) agp_aper_base == aper_base && agp_aper_order == aper_order) { /* the same between two setting from NB and agp */ - if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) { + if (!no_iommu && + max_pfn > MAX_DMA32_PFN && + !printed_gart_size_msg) { printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); printk(KERN_ERR "please increase GART size in your BIOS setup\n"); printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); @@ -448,7 +450,7 @@ out: /* Got the aperture from the AGP bridge */ } else if (swiotlb && !valid_agp) { /* Do nothing */ - } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || fallback_aper_force) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 1b76b25..3900ff5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -527,7 +527,7 @@ __init void e820_setup_gap(void) #ifdef CONFIG_X86_64 if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " "address range\n" KERN_ERR "PCI: Unassigned devices with 32bit resource " diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 84fd9f2..a4665f3 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -50,7 +50,7 @@ static void __init fix_hypertransport_config(int num, int slot, int func) static void __init via_bugs(int num, int slot, int func) { #ifdef CONFIG_GART_IOMMU - if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + if ((max_pfn > MAX_DMA32_PFN || force_iommu) && !gart_iommu_aperture_allowed) { printk(KERN_INFO "Looks like a VIA chipset. Disabling IOMMU." diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03d..7830dc4 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -110,7 +110,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { pgd_t *level4p; level4p = (pgd_t *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); + return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); } static void set_idt(void *newidt, u16 limit) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index e28ec49..6959b5c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1394,7 +1394,7 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1459,7 +1459,7 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (end_pfn > MAX_DMA32_PFN) + if (max_pfn > MAX_DMA32_PFN) printk(KERN_ERR "WARNING more than 4GB of memory, " "32bit PCI may malfunction.\n"); return ret; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cb0bdf4..8467ec2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -75,7 +75,7 @@ early_param("dma32_size", parse_dma32_size_opt); void __init dma32_reserve_bootmem(void) { unsigned long size, align; - if (end_pfn <= MAX_DMA32_PFN) + if (max_pfn <= MAX_DMA32_PFN) return; /* @@ -94,7 +94,7 @@ void __init dma32_reserve_bootmem(void) static void __init dma32_free_bootmem(void) { - if (end_pfn <= MAX_DMA32_PFN) + if (max_pfn <= MAX_DMA32_PFN) return; if (!dma32_bootmem_ptr) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 021f3c6..d0d18db 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -751,10 +751,10 @@ void __init gart_iommu_init(void) return; if (no_iommu || - (!force_iommu && end_pfn <= MAX_DMA32_PFN) || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { - if (end_pfn > MAX_DMA32_PFN) { + if (max_pfn > MAX_DMA32_PFN) { printk(KERN_WARNING "More than 4GB of memory " "but GART IOMMU not available.\n" KERN_WARNING "falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 490da7f..82299cd 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -38,7 +38,7 @@ const struct dma_mapping_ops swiotlb_dma_ops = { void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) + if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; if (swiotlb_force) swiotlb = 1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6eced2f..d5d4b04 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -49,11 +49,6 @@ #include <asm/cacheflush.h> /* - * PFN of last memory page. - */ -unsigned long end_pfn; - -/* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. * The direct mapping extends to max_pfn_mapped, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. @@ -586,9 +581,9 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; + max_zone_pfns[ZONE_NORMAL] = max_pfn; - memory_present(0, 0, end_pfn); + memory_present(0, 0, max_pfn); sparse_init(); free_area_init_nodes(max_zone_pfns); } @@ -670,8 +665,8 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif - reservedpages = end_pfn - totalram_pages - - absent_pages_in_range(0, end_pfn); + reservedpages = max_pfn - totalram_pages - + absent_pages_in_range(0, max_pfn); after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -690,7 +685,7 @@ void __init mem_init(void) printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), + max_pfn << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, @@ -784,7 +779,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, #endif unsigned long pfn = phys >> PAGE_SHIFT; - if (pfn >= end_pfn) { + if (pfn >= max_pfn) { /* * This can happen with kdump kernels when accessing * firmware tables: diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 317573e..41f1b5c 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -143,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit |= (1<<24)-1; limit++; - if (limit > end_pfn << PAGE_SHIFT) - limit = end_pfn << PAGE_SHIFT; + if (limit > max_pfn << PAGE_SHIFT) + limit = max_pfn << PAGE_SHIFT; if (limit <= base) continue; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 316e5f9..b432d57 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -86,7 +86,7 @@ static int __init allocate_cachealigned_memnodemap(void) addr = 0x8000; nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, + nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, nodemap_size, L1_CACHE_BYTES); if (nodemap_addr == -1UL) { printk(KERN_ERR @@ -579,7 +579,7 @@ void __init paging_init(void) memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; + max_zone_pfns[ZONE_NORMAL] = max_pfn; sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index b67f5a1..0fd67b8 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -299,7 +299,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = end_pfn - absent_pages_in_range(0, end_pfn); + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index b542355..6dd000d 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -83,7 +83,7 @@ static int set_up_temporary_mappings(void) /* Set up the direct mapping from scratch */ start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); + end = (unsigned long)pfn_to_kaddr(max_pfn); for (; start < end; start = next) { pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index ac37643..cac5b9e 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -58,7 +58,8 @@ void clear_page(void *page); void copy_page(void *to, void *from); -extern unsigned long end_pfn; +/* duplicated to the one in bootmem.h */ +extern unsigned long max_pfn; extern unsigned long phys_base; extern unsigned long __phys_addr(unsigned long); @@ -87,7 +88,7 @@ extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < end_pfn) +#define pfn_valid(pfn) ((pfn) < max_pfn) #endif -- cgit v1.1 From 3381959da5a00ae8289cfbd28b0b6d228f2d1d46 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 24 Jun 2008 11:48:30 -0700 Subject: x86: cleanup e820_setup_gap(), add e820_search_gap(), v2 This is a preparatory patch for the next patch in series. Moves some code from e820_setup_gap to a new function e820_search_gap. This patch is a part of a bug fix where we walk the ACPI table to calculate a gap for PCI optional devices. v1->v2: Patch on top of tip/master. Fixes a bug introduced in the last patch about the typeof "last". Also the new function e820_search_gap now returns if we found a gap in e820_map. Signed-off-by: Alok N Kataria <akataria@vmware.com> Cc: lenb@kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 43 ++++++++++++++++++++++++++++--------------- include/asm-x86/e820.h | 2 ++ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3900ff5..22cfd66 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -488,26 +488,22 @@ void __init update_e820(void) } /* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. + * Search for a gap in the e820 memory space from start_addr to 2^32. */ -__init void e820_setup_gap(void) +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr) { - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; + unsigned long long last = 0x100000000ull; + int i = e820.nr_map; int found = 0; - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; while (--i >= 0) { unsigned long long start = e820.map[i].addr; unsigned long long end = start + e820.map[i].size; + if (end < start_addr) + continue; + /* * Since "last" is at most 4GB, we know we'll * fit in 32 bits if this condition is true @@ -515,15 +511,32 @@ __init void e820_setup_gap(void) if (last > end) { unsigned long gap = last - end; - if (gap > gapsize) { - gapsize = gap; - gapstart = end; + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; found = 1; } } if (start < last) last = start; } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize, round; + int found; + + gapstart = 0x10000000; + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize, 0); #ifdef CONFIG_X86_64 if (!found) { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 13fa5a0..f622685 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -71,6 +71,8 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype); extern void update_e820(void); extern void e820_setup_gap(void); +extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr); struct setup_data; extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); -- cgit v1.1 From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 25 Jun 2008 05:44:40 -0700 Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn addresses Fix some problems with (and applies on top of) a previous patch: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Primarily change "0x%8lx" format, which displays with a right aligned space filled hex number (spaces between the "0x" prefix and the number), into "%0#10lx" format, which zero fills instead of space fills, and which uses the printf flag '#' to request the "0x" prefix instead of hard coding it. Also replace some other "0x%lx" formats with "%#lx", making use of the '#' printf flag again. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 2 +- mm/page_alloc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22cfd66..1dcb665 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1014,7 +1014,7 @@ unsigned long __init e820_end_of_ram(void) if (last_pfn > end_user_pfn) last_pfn = end_user_pfn; - printk(KERN_INFO "last_pfn = 0x%lx max_arch_pfn = 0x%lx\n", + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b604c64..f024b9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " + printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s 0x%8lx -> 0x%8lx\n", + printk(" %-8s %0#10lx -> %0#10lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, + printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.1 From 200001eb140ea33477965f2050bea0dac801974b Mon Sep 17 00:00:00 2001 From: Paul Jackson <pj@sgi.com> Date: Wed, 25 Jun 2008 05:44:46 -0700 Subject: x86 boot: only pick up additional EFI memmap if add_efi_memmap flag Applies on top of the previous patch: x86 boot: add code to add BIOS provided EFI memory entries to kernel Instead of always adding EFI memory map entries (if present) to the memory map after initially finding either E820 BIOS memory map entries and/or kernel command line memmap entries, -instead- only add such additional EFI memory map entries if the kernel boot option: add_efi_memmap is specified. Requiring this 'add_efi_memmap' option is backward compatible with kernels that didn't load such additional EFI memory map entries in the first place, and it doesn't override a configuration that tries to replace all E820 or EFI BIOS memory map entries with ones given entirely on the kernel command line. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: "Yinghai Lu" <yhlu.kernel@gmail.com> Cc: "Jack Steiner" <steiner@sgi.com> Cc: "Mike Travis" <travis@sgi.com> Cc: "Huang Cc: Ying" <ying.huang@intel.com> Cc: "Andi Kleen" <andi@firstfloor.org> Cc: "Andrew Morton" <akpm@linux-foundation.org> Cc: Paul Jackson <pj@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/kernel-parameters.txt | 3 +++ Documentation/x86/x86_64/uefi.txt | 4 ++++ arch/x86/kernel/efi.c | 16 ++++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index da13d6d..795c487 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2150,6 +2150,9 @@ and is between 256 and 4096 characters. It is defined in the file usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. + add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in + kernel's map of available physical RAM. + vdso= [X86-32,SH,x86-64] vdso=2: enable compat VDSO (default with COMPAT_VDSO) vdso=1: enable VDSO (default) diff --git a/Documentation/x86/x86_64/uefi.txt b/Documentation/x86/x86_64/uefi.txt index 7d77120..a5e2b4f 100644 --- a/Documentation/x86/x86_64/uefi.txt +++ b/Documentation/x86/x86_64/uefi.txt @@ -36,3 +36,7 @@ Mechanics: services. noefi turn off all EFI runtime services reboot_type=k turn off EFI reboot runtime service +- If the EFI memory map has additional entries not in the E820 map, + you can include those entries in the kernels memory map of available + physical RAM by using the following kernel command line parameter. + add_efi_memmap include EFI memory map of available physical RAM diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index a03ca36..94382fa 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -64,6 +64,17 @@ static int __init setup_noefi(char *arg) } early_param("noefi", setup_noefi); +int add_efi_memmap; +EXPORT_SYMBOL(add_efi_memmap); + +static int __init setup_add_efi_memmap(char *arg) +{ + add_efi_memmap = 1; + return 0; +} +early_param("add_efi_memmap", setup_add_efi_memmap); + + static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { return efi_call_virt2(get_time, tm, tc); @@ -219,7 +230,7 @@ unsigned long efi_get_time(void) * (zeropage) memory map. */ -static void __init add_efi_memmap(void) +static void __init do_add_efi_memmap(void) { void *p; @@ -406,7 +417,8 @@ void __init efi_init(void) if (memmap.desc_size != sizeof(efi_memory_desc_t)) printk(KERN_WARNING "Kernel-defined memdesc" "doesn't match the one from EFI!\n"); - add_efi_memmap(); + if (add_efi_memmap) + do_add_efi_memmap(); /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; -- cgit v1.1 From 383bc5cecc2ed0b8f44a25488660b03030425ef7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Thu, 26 Jun 2008 09:23:21 +0200 Subject: x86, crashdump, /proc/vmcore: remove CONFIG_EXPERIMENTAL from kdump I would suggest to remove the "experimental" status from Kdump. Kdump is now in the kernel since a long time and used by Enterprise distributions. I don't think that "experimental" is true any more. Signed-off-by: Bernhard Walle <bwalle@suse.de> Cc: vgoyal@redhat.com Cc: kexec@lists.infradead.org Cc: Bernhard Walle <bwalle@suse.de> Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 1 - fs/Kconfig | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 112afd3..25ff754 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1269,7 +1269,6 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" - depends on EXPERIMENTAL depends on X86_64 || (X86_32 && HIGHMEM) help Generate crash dump after being started by kexec. diff --git a/fs/Kconfig b/fs/Kconfig index 2694648..313b2e0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -930,7 +930,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + depends on PROC_FS && CRASH_DUMP default y help Exports the dump image of crashed kernel in ELF format. -- cgit v1.1 From b9d19f4a51447930db002409945ad40a7a373cb0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 26 Jun 2008 00:44:56 -0700 Subject: x86: fix memory setup bug interesting... [ 0.000000] mapped low ram: 0 - 20000000 [ 0.000000] low ram: 00000000 - 1fff0000 [ 0.000000] bootmap 00002000 - 00006000 max_pfn_mapped > max_low_pfn? it seems init_memory_mapping reveals an old bug. please check attached test patch. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 4b953dc..392812d 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -427,7 +427,7 @@ void __init setup_arch(char **cmdline_p) find_low_pfn_range(); /* max_pfn_mapped is updated here */ - init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); reserve_initrd(); -- cgit v1.1 From 378b39a4f91ab0846eb6e13d47ea812bc82b44d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:48:14 -0700 Subject: x86: rename setup.c to setup_percpu.c some functions need to be moved to setup_numa.c after we merge setup32/64.c, some funcs need to be moved back to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/setup.c | 528 ----------------------------------------- arch/x86/kernel/setup_percpu.c | 528 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 529 insertions(+), 529 deletions(-) delete mode 100644 arch/x86/kernel/setup.c create mode 100644 arch/x86/kernel/setup_percpu.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0a1987b..5e1537b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o +obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c deleted file mode 100644 index 5c0c4bb..0000000 --- a/arch/x86/kernel/setup.c +++ /dev/null @@ -1,528 +0,0 @@ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/percpu.h> -#include <linux/kexec.h> -#include <linux/crash_dump.h> -#include <asm/smp.h> -#include <asm/percpu.h> -#include <asm/sections.h> -#include <asm/processor.h> -#include <asm/setup.h> -#include <asm/topology.h> -#include <asm/mpspec.h> -#include <asm/apicdef.h> -#include <asm/highmem.h> - -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -unsigned int max_physical_apicid; -EXPORT_SYMBOL(boot_cpu_physical_apicid); - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; -#endif - -/* map cpu index to physical APIC ID */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define X86_64_NUMA 1 - -/* map cpu index to node index */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* which logical CPUs are on which nodes */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); - -/* setup node_to_cpumask_map */ -static void __init setup_node_to_cpumask_map(void); - -#else -static inline void setup_node_to_cpumask_map(void) { } -#endif - -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) -/* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas. These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. - */ -static void __init setup_per_cpu_maps(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif - } - - /* indicate the early static arrays will soon be gone */ - early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA - early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; -#endif -} - -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - -#ifdef CONFIG_X86_32 -/* - * Great future not-so-futuristic plan: make i386 and x86_64 do it - * the same way - */ -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(__per_cpu_offset); -static inline void setup_cpu_pda_map(void) { } - -#elif !defined(CONFIG_SMP) -static inline void setup_cpu_pda_map(void) { } - -#else /* CONFIG_SMP && CONFIG_X86_64 */ - -/* - * Allocate cpu_pda pointer table and array via alloc_bootmem. - */ -static void __init setup_cpu_pda_map(void) -{ - char *pda; - struct x8664_pda **new_cpu_pda; - unsigned long size; - int cpu; - - size = roundup(sizeof(struct x8664_pda), cache_line_size()); - - /* allocate cpu_pda array and pointer table */ - { - unsigned long tsize = nr_cpu_ids * sizeof(void *); - unsigned long asize = size * (nr_cpu_ids - 1); - - tsize = roundup(tsize, cache_line_size()); - new_cpu_pda = alloc_bootmem(tsize + asize); - pda = (char *)new_cpu_pda + tsize; - } - - /* initialize pointer table to static pda's */ - for_each_possible_cpu(cpu) { - if (cpu == 0) { - /* leave boot cpu pda in place */ - new_cpu_pda[0] = cpu_pda(0); - continue; - } - new_cpu_pda[cpu] = (struct x8664_pda *)pda; - new_cpu_pda[cpu]->in_bootmem = 1; - pda += size; - } - - /* point to new pointer table */ - _cpu_pda = new_cpu_pda; -} -#endif - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - ssize_t size = PERCPU_ENOUGH_ROOM; - char *ptr; - int cpu; - - /* no processor from mptable or madt */ - if (!num_processors) - num_processors = 1; - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#else - nr_cpu_ids = num_processors; -#endif - - /* Setup cpu_pda map */ - setup_cpu_pda_map(); - - /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; - printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", - size); - - for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - printk(KERN_INFO - "cpu %d has no node %d or node-local memory\n", - cpu, node); - } - else - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); -#endif - per_cpu_offset(cpu) = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - - } - - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - - /* Setup percpu data maps */ - setup_per_cpu_maps(); - - /* Setup node to cpumask map */ - setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); -} - -#endif - -void __init parse_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); - switch (data->type) { - case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); - break; - default: - break; - } -#ifndef CONFIG_DEBUG_BOOT_PARAMS - free_early(pa_data, pa_data+sizeof(*data)+data->len); -#endif - pa_data = data->next; - early_iounmap(data, PAGE_SIZE); - } -} - -#ifdef X86_64_NUMA - -/* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. - * - * Note: node_to_cpumask() is not valid until after this is done. - */ -static void __init setup_node_to_cpumask_map(void) -{ - unsigned int node, num = 0; - cpumask_t *map; - - /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } - - /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - if (cpu_pda(cpu) && node != NUMA_NO_NODE) - cpu_pda(cpu)->nodenumber = node; - - if (cpu_to_node_map) - cpu_to_node_map[cpu] = node; - - else if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -void __cpuinit numa_add_cpu(int cpu) -{ - cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); -} - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = cpu_to_node(cpu); - cpumask_t *mask; - char buf[64]; - - if (node_to_cpumask_map == NULL) { - printk(KERN_ERR "node_to_cpumask_map NULL\n"); - dump_stack(); - return; - } - - mask = &node_to_cpumask_map[node]; - if (enable) - cpu_set(cpu, *mask); - else - cpu_clear(cpu, *mask); - - cpulist_scnprintf(buf, sizeof(buf), *mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); - } - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} - -int cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!per_cpu_offset(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -cpumask_t *_node_to_cpumask_ptr(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return &cpu_online_map; - } - BUG_ON(node >= nr_node_ids); - return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(_node_to_cpumask_ptr); - -/* - * Returns a bitmask of CPUs on Node 'node'. - */ -cpumask_t node_to_cpumask(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); - dump_stack(); - return cpu_online_map; - } - BUG_ON(node >= nr_node_ids); - return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ - -#endif /* X86_64_NUMA */ - - -/* - * --------- Crashkernel reservation ------------------------------ - */ - -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif - - return total << PAGE_SHIFT; -} - -#ifdef CONFIG_KEXEC -void __init reserve_crashkernel(void) -{ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; - int ret; - - total_mem = get_total_mem(); - - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); - return; - } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); - return; - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } -} -#else -void __init reserve_crashkernel(void) -{} -#endif -static struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x60, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x64, .end = 0x64, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -void __init reserve_standard_io_resources(void) -{ - int i; - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - -} - -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - - - diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c new file mode 100644 index 0000000..5c0c4bb --- /dev/null +++ b/arch/x86/kernel/setup_percpu.c @@ -0,0 +1,528 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/percpu.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> +#include <asm/smp.h> +#include <asm/percpu.h> +#include <asm/sections.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/topology.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/highmem.h> + +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int num_processors; +unsigned disabled_cpus __cpuinitdata; +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +unsigned int max_physical_apicid; +EXPORT_SYMBOL(boot_cpu_physical_apicid); + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; +#endif + +/* map cpu index to physical APIC ID */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#define X86_64_NUMA 1 + +/* map cpu index to node index */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* which logical CPUs are on which nodes */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* setup node_to_cpumask_map */ +static void __init setup_node_to_cpumask_map(void); + +#else +static inline void setup_node_to_cpumask_map(void) { } +#endif + +#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) +/* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ +static void __init setup_per_cpu_maps(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_bios_cpu_apicid, cpu) = + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif +} + +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +cpumask_t *cpumask_of_cpu_map __read_mostly; +EXPORT_SYMBOL(cpumask_of_cpu_map); + +/* requires nr_cpu_ids to be initialized */ +static void __init setup_cpumask_of_cpu(void) +{ + int i; + + /* alloc_bootmem zeroes memory */ + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); + for (i = 0; i < nr_cpu_ids; i++) + cpu_set(i, cpumask_of_cpu_map[i]); +} +#else +static inline void setup_cpumask_of_cpu(void) { } +#endif + +#ifdef CONFIG_X86_32 +/* + * Great future not-so-futuristic plan: make i386 and x86_64 do it + * the same way + */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. + */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} +#endif + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. + * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; + + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#else + nr_cpu_ids = num_processors; +#endif + + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + + /* Copy section for each CPU (we discard the original) */ + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", + size); + + for_each_possible_cpu(cpu) { +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(cpu); + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(size); + printk(KERN_INFO + "cpu %d has no node %d or node-local memory\n", + cpu, node); + } + else + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); +#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + + } + + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", + NR_CPUS, nr_cpu_ids, nr_node_ids); + + /* Setup percpu data maps */ + setup_per_cpu_maps(); + + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + + /* Setup cpumask_of_cpu map */ + setup_cpumask_of_cpu(); +} + +#endif + +void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + case SETUP_E820_EXT: + parse_e820_ext(data, pa_data); + break; + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} + +#ifdef X86_64_NUMA + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + + Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + if (cpu_pda(cpu) && node != NUMA_NO_NODE) + cpu_pda(cpu)->nodenumber = node; + + if (cpu_to_node_map) + cpu_to_node_map[cpu] = node; + + else if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), *mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); + } + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +cpumask_t *_node_to_cpumask_ptr(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return &cpu_online_map; + } + BUG_ON(node >= nr_node_ids); + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(_node_to_cpumask_ptr); + +/* + * Returns a bitmask of CPUs on Node 'node'. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + BUG_ON(node >= nr_node_ids); + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ + +#endif /* X86_64_NUMA */ + + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +#ifdef CONFIG_KEXEC +void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base <= 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + } +} +#else +void __init reserve_crashkernel(void) +{} +#endif +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); +#endif + + + -- cgit v1.1 From 08afc7c0dd8ecb04c7a6fe367102e410a74abbe6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:48:23 -0700 Subject: x86: we can use full bootmem after have init_memory_mapping So remove outdated comments Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 392812d..5e919e2 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -476,15 +476,6 @@ void __init setup_arch(char **cmdline_p) */ vmi_init(); #endif - /* - * NOTE: before this point _nobody_ is allowed to allocate - * any memory using the bootmem allocator. Although the - * allocator is now initialised only the first 8Mb of the kernel - * virtual address space has been mapped. All allocations before - * paging_init() has completed must use the alloc_bootmem_low_pages() - * variant (which allocates DMA'able memory) and care must be taken - * not to exceed the 8Mb limit. - */ paging_init(); -- cgit v1.1 From eb1379cb296f5aee348c2e04317d911bb84d9184 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:49:26 -0700 Subject: x86: update reserve_initrd to support 64bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 109 +++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5e919e2..aa81779 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -186,43 +186,18 @@ static inline void copy_edd(void) #ifdef CONFIG_BLK_DEV_INITRD -static void __init relocate_initrd(void); +#ifdef CONFIG_X86_32 -static void __init reserve_initrd(void) +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) +static void __init relocate_initrd(void) { + u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; u64 ramdisk_here; - - if (!boot_params.hdr.type_of_loader || - !ramdisk_image || !ramdisk_size) - return; /* No initrd provided by bootloader */ - - initrd_start = 0; - - if (ramdisk_size >= (end_of_lowmem>>1)) { - free_early(ramdisk_image, ramdisk_end); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; - } - - printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image, - ramdisk_end); - - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start + ramdisk_size; - return; - } + unsigned long slop, clen, mapaddr; + char *p, *q; /* We need to move the initrd down into lowmem */ ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, @@ -241,22 +216,6 @@ static void __init reserve_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", ramdisk_here, ramdisk_here + ramdisk_size); - relocate_initrd(); -} - -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) - -static void __init relocate_initrd(void) -{ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; - u64 ramdisk_here; - unsigned long slop, clen, mapaddr; - char *p, *q; - - ramdisk_here = initrd_start - PAGE_OFFSET; - q = (char *)initrd_start; /* Copy any lowmem portion of the initrd */ @@ -286,18 +245,60 @@ static void __init relocate_initrd(void) /* high pages is not converted by early_res_to_bootmem */ ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; - printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n", + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" + " %08llx - %08llx\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); +} +#endif - /* - * need to free old one, otherwise init cross max_low_pfn could be - * converted to bootmem - */ - free_early(ramdisk_image, ramdisk_image+ramdisk_size); +static void __init reserve_initrd(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + initrd_start = 0; + + if (ramdisk_size >= (end_of_lowmem>>1)) { + free_early(ramdisk_image, ramdisk_end); + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + + printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, + ramdisk_end); + + + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + /* + * don't need to reserve again, already reserved early + * in i386_start_kernel + */ + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + return; + } + +#ifdef CONFIG_X86_32 + relocate_initrd(); +#else + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08llx > 0x%08llx)\ndisabling initrd\n", + ramdisk_end, end_of_lowmem); + initrd_start = 0; +#endif + free_early(ramdisk_image, ramdisk_end); } #else -void __init reserve_initrd(void) +static void __init reserve_initrd(void) { } #endif /* CONFIG_BLK_DEV_INITRD */ -- cgit v1.1 From 7dea23ecd17db6e42e19499db70d2fcfa5ca1ee2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:50:06 -0700 Subject: x86: put global variable for 32bit all together those variables are not needed by 64 bit. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 57 +++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index aa81779..98f9199 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -100,6 +100,8 @@ static struct resource bss_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; + +#ifdef CONFIG_X86_32 static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, @@ -108,24 +110,47 @@ static struct resource video_ram_resource = { }; /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; EXPORT_SYMBOL(boot_cpu_data); +static void set_mca_bus(int x) +{ +#ifdef CONFIG_MCA + MCA_bus = x; +#endif +} unsigned int def_to_bigsmp; -#ifndef CONFIG_X86_PAE -unsigned long mmu_cr4_features; -#else -unsigned long mmu_cr4_features = X86_CR4_PAE; -#endif - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; unsigned int machine_submodel_id; unsigned int BIOS_revision; +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +struct ist_info ist_info; +EXPORT_SYMBOL(ist_info); +#else +struct ist_info ist_info; +#endif + +#else +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); +#endif + + +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif + /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -140,15 +165,8 @@ char dmi_alloc_data[DMI_MAX_DATA]; */ struct screen_info screen_info; EXPORT_SYMBOL(screen_info); -struct apm_info apm_info; -EXPORT_SYMBOL(apm_info); struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); -struct ist_info ist_info; -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) -EXPORT_SYMBOL(ist_info); -#endif extern int root_mountflags; @@ -303,15 +321,6 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_MCA -static void set_mca_bus(int x) -{ - MCA_bus = x; -} -#else -static void set_mca_bus(int x) { } -#endif - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures -- cgit v1.1 From 46d671b525102ae005dc5e8389ca67c86ae012b1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:51:29 -0700 Subject: x86: add extra includes for 64bit support Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 50 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 98f9199..0b69483 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -45,31 +45,75 @@ #include <linux/dmi.h> #include <linux/pfn.h> #include <linux/pci.h> +#include <asm/pci-direct.h> #include <linux/init_ohci1394_dma.h> #include <linux/kvm_para.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/delay.h> +#include <linux/highmem.h> + +#include <linux/kallsyms.h> +#include <linux/edd.h> +#include <linux/iscsi_ibft.h> +#include <linux/kexec.h> +#include <linux/cpufreq.h> +#include <linux/dma-mapping.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> + +#include <linux/percpu.h> +#include <linux/crash_dump.h> + #include <video/edid.h> #include <asm/mtrr.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> -#include <asm/mmzone.h> #include <asm/setup.h> #include <asm/arch_hooks.h> #include <asm/sections.h> #include <asm/dmi.h> #include <asm/io_apic.h> #include <asm/ist.h> -#include <asm/io.h> #include <asm/vmi.h> #include <setup_arch.h> #include <asm/bios_ebda.h> #include <asm/cacheflush.h> #include <asm/processor.h> -#include <asm/efi.h> #include <asm/bugs.h> +#include <asm/system.h> +#include <asm/vsyscall.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/dma.h> +#include <asm/gart.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> + +#include <mach_apic.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ARCH_SETUP +#endif + +#include <asm/percpu.h> +#include <asm/sections.h> +#include <asm/topology.h> +#include <asm/apicdef.h> +#ifdef CONFIG_X86_32 +#include <asm/highmem.h> +#endif + /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ -- cgit v1.1 From 76934ed4b33b65096296d3e7b3c046fd020019fc Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:52:35 -0700 Subject: x86: merge 64bit setup_arch into setup_32 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 99 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 0b69483..f3177ba 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -372,26 +372,38 @@ static void __init reserve_initrd(void) * global efi_enabled. This allows the same kernel image to be used on existing * systems (with a traditional BIOS) as well as on EFI systems. */ +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ + void __init setup_arch(char **cmdline_p) { +#ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); early_ioremap_init(); reserve_setup_data(); +#else + printk(KERN_INFO "Command line: %s\n", boot_command_line); +#endif ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; - saved_video_mode = boot_params.hdr.vid_mode; - if( boot_params.sys_desc_table.length != 0 ) { + if (boot_params.sys_desc_table.length != 0) { set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); machine_id = boot_params.sys_desc_table.table[0]; machine_submodel_id = boot_params.sys_desc_table.table[1]; BIOS_revision = boot_params.sys_desc_table.table[2]; } +#endif + saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; #ifdef CONFIG_BLK_DEV_RAM @@ -401,7 +413,12 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) { +#ifdef CONFIG_X86_32 + "EL32", +#else + "EL64", +#endif + 4)) { efi_enabled = 1; efi_reserve_early(); } @@ -417,7 +434,11 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; +#ifdef CONFIG_X86_32 init_mm.brk = init_pg_tables_end + PAGE_OFFSET; +#else + init_mm.brk = (unsigned long) &_end; +#endif code_resource.start = virt_to_phys(_text); code_resource.end = virt_to_phys(_etext)-1; @@ -426,6 +447,9 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; +#ifdef CONFIG_X86_64 + early_cpu_init(); +#endif strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; @@ -433,16 +457,27 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (acpi_mps_check()){ + if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_32 enable_local_apic = -1; +#else + disable_apic = 1; +#endif #endif clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); } finish_e820_parsing(); +#ifdef CONFIG_X86_32 probe_roms(); +#else +# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +# endif +#endif /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); @@ -452,6 +487,7 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); +#ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, E820_RESERVED); @@ -459,6 +495,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "fixed physical RAM map:\n"); e820_print_map("bad_ppro"); } +#else + early_gart_iommu_check(); +#endif e820_register_active_regions(0, 0, -1UL); /* @@ -477,14 +516,32 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820_end_of_ram(); } +#ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); +#else + num_physpages = max_pfn; + + check_efer(); + + /* How many end-of-memory variables you have, grandma! */ + /* need this before calling reserve_initrd */ + max_low_pfn = max_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#endif /* max_pfn_mapped is updated here */ +#ifdef CONFIG_X86_64 + max_pfn_mapped = +#endif max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); reserve_initrd(); +#ifdef CONFIG_X86_64 + vsmp_init(); +#endif + dmi_scan_machine(); io_delay_init(); @@ -494,6 +551,11 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); +#ifdef CONFIG_X86_64 + /* Remove active ranges so rediscovery with NUMA-awareness happens */ + remove_all_active_ranges(); +#endif + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -503,13 +565,18 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); +#ifdef CONFIG_X86_64 + dma32_reserve_bootmem(); +#endif + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_FIND_SMP_CONFIG +#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \ + defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64) /* * Find and reserve possible boot-time SMP configuration: */ @@ -523,7 +590,7 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#ifdef CONFIG_VMI +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) /* * Must be after max_low_pfn is determined, and before kernel * pagetables are setup. @@ -533,11 +600,15 @@ void __init setup_arch(char **cmdline_p) paging_init(); +#ifdef CONFIG_X86_64 + map_vsyscall(); +#endif + /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. */ -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT +#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32) if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); #endif @@ -553,6 +624,10 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); +#ifdef CONFIG_X86_64 + init_cpu_to_node(); +#endif + #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: @@ -560,18 +635,26 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); #endif -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + +#ifdef CONFIG_X86_64 + init_apic_mappings(); + ioapic_init_mappings(); +#else +# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) if (def_to_bigsmp) printk(KERN_WARNING "More than 8 CPUs detected and " "CONFIG_X86_PC cannot handle it.\nUse " "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); +# endif #endif kvm_guest_init(); e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); +#ifdef CONFIG_X86_32 request_resource(&iomem_resource, &video_ram_resource); +#endif reserve_standard_io_resources(); e820_setup_gap(); -- cgit v1.1 From f2f865fe6e6e40ddf37f887eb427263d83bb925d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:53:22 -0700 Subject: x86: space to tab in setup_arch Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index f3177ba..4d5f8a3 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -557,10 +557,10 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); #endif initmem_init(0, max_pfn); -- cgit v1.1 From 55f262391a2365d657a00ed68edd1a51bca66af5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:54:23 -0700 Subject: x86: rename setup_32.c to setup.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and let 64 bit use that instead of setup_64.c [ mingo@elte.hu ] x86: build fix fix: arch/x86/kernel/setup.c: In function ‘setup_arch': arch/x86/kernel/setup.c:561: error: implicit declaration of function ‘efi_reserve_early' and: arch/x86/kernel/setup.c:766: error: implicit declaration of function 'init_cpu_to_node' and: arch/x86/kernel/setup.c:676: warning: operation on 'max_pfn_mapped' may be undefined Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/setup.c | 672 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 671 -------------------------------------------- 3 files changed, 673 insertions(+), 672 deletions(-) create mode 100644 arch/x86/kernel/setup.c delete mode 100644 arch/x86/kernel/setup_32.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e1537b..212804e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -18,7 +18,7 @@ CFLAGS_tsc_64.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o obj-y += traps_$(BITS).o irq_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o -obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup_percpu.o +obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c new file mode 100644 index 0000000..222d703 --- /dev/null +++ b/arch/x86/kernel/setup.c @@ -0,0 +1,672 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons <orc@pell.chi.il.us>, July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle <bmoyle@mvista.com>, February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel <mochel@osdl.org>, March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/screen_info.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/apm_bios.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/mca.h> +#include <linux/root_dev.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/iscsi_ibft.h> +#include <linux/nodemask.h> +#include <linux/kexec.h> +#include <linux/dmi.h> +#include <linux/pfn.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> +#include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/delay.h> +#include <linux/highmem.h> + +#include <linux/kallsyms.h> +#include <linux/edd.h> +#include <linux/iscsi_ibft.h> +#include <linux/kexec.h> +#include <linux/cpufreq.h> +#include <linux/dma-mapping.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> + +#include <linux/percpu.h> +#include <linux/crash_dump.h> + +#include <video/edid.h> + +#include <asm/mtrr.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/arch_hooks.h> +#include <asm/efi.h> +#include <asm/sections.h> +#include <asm/dmi.h> +#include <asm/io_apic.h> +#include <asm/ist.h> +#include <asm/vmi.h> +#include <setup_arch.h> +#include <asm/bios_ebda.h> +#include <asm/cacheflush.h> +#include <asm/processor.h> +#include <asm/bugs.h> + +#include <asm/system.h> +#include <asm/vsyscall.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/dma.h> +#include <asm/gart.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> + +#include <mach_apic.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ARCH_SETUP +#endif + +#include <asm/percpu.h> +#include <asm/sections.h> +#include <asm/topology.h> +#include <asm/apicdef.h> +#ifdef CONFIG_X86_64 +#include <asm/numa_64.h> +#endif +#ifdef CONFIG_X86_32 +#include <asm/highmem.h> +#endif + +/* This value is set up by the early boot code to point to the value + immediately after the boot time page tables. It contains a *physical* + address, and must not be in the .bss segment! */ +unsigned long init_pg_tables_start __initdata = ~0UL; +unsigned long init_pg_tables_end __initdata = ~0UL; + +/* + * Machine setup.. + */ +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + + +#ifdef CONFIG_X86_32 +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +/* cpu data as detected by the assembly code in head.S */ +struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +/* common cpu data for all cpus */ +struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +EXPORT_SYMBOL(boot_cpu_data); +static void set_mca_bus(int x) +{ +#ifdef CONFIG_MCA + MCA_bus = x; +#endif +} + +unsigned int def_to_bigsmp; + +/* for MCA, but anyone else can use it if they want */ +unsigned int machine_id; +unsigned int machine_submodel_id; +unsigned int BIOS_revision; + +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +struct ist_info ist_info; +EXPORT_SYMBOL(ist_info); +#else +struct ist_info ist_info; +#endif + +#else +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); +#endif + + +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif + +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ +int bootloader_type; + +/* + * Early DMI memory + */ +int dmi_alloc_index; +char dmi_alloc_data[DMI_MAX_DATA]; + +/* + * Setup options + */ +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); + +extern int root_mountflags; + +unsigned long saved_video_mode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char __initdata command_line[COMMAND_LINE_SIZE]; + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; +} +#else +static inline void copy_edd(void) +{ +} +#endif + +#ifdef CONFIG_BLK_DEV_INITRD + +#ifdef CONFIG_X86_32 + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) +static void __init relocate_initrd(void) +{ + + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 ramdisk_here; + unsigned long slop, clen, mapaddr; + char *p, *q; + + /* We need to move the initrd down into lowmem */ + ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + PAGE_SIZE); + + if (ramdisk_here == -1ULL) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + + /* Note: this includes all the lowmem currently occupied by + the initrd, we rely on that fact to keep the data intact. */ + reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + "NEW RAMDISK"); + initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", + ramdisk_here, ramdisk_here + ramdisk_size); + + q = (char *)initrd_start; + + /* Copy any lowmem portion of the initrd */ + if (ramdisk_image < end_of_lowmem) { + clen = end_of_lowmem - ramdisk_image; + p = (char *)__va(ramdisk_image); + memcpy(q, p, clen); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + /* Copy the highmem portion of the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_ioremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + /* high pages is not converted by early_res_to_bootmem */ + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" + " %08llx - %08llx\n", + ramdisk_image, ramdisk_image + ramdisk_size - 1, + ramdisk_here, ramdisk_here + ramdisk_size - 1); +} +#endif + +static void __init reserve_initrd(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + initrd_start = 0; + + if (ramdisk_size >= (end_of_lowmem>>1)) { + free_early(ramdisk_image, ramdisk_end); + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + + printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, + ramdisk_end); + + + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + /* + * don't need to reserve again, already reserved early + * in i386_start_kernel + */ + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + return; + } + +#ifdef CONFIG_X86_32 + relocate_initrd(); +#else + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08llx > 0x%08llx)\ndisabling initrd\n", + ramdisk_end, end_of_lowmem); + initrd_start = 0; +#endif + free_early(ramdisk_image, ramdisk_end); +} +#else +static void __init reserve_initrd(void) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ + +void __init setup_arch(char **cmdline_p) +{ +#ifdef CONFIG_X86_32 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + pre_setup_arch_hook(); + early_cpu_init(); + early_ioremap_init(); + reserve_setup_data(); +#else + printk(KERN_INFO "Command line: %s\n", boot_command_line); +#endif + + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; + if (boot_params.sys_desc_table.length != 0) { + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); + machine_id = boot_params.sys_desc_table.table[0]; + machine_submodel_id = boot_params.sys_desc_table.table[1]; + BIOS_revision = boot_params.sys_desc_table.table[2]; + } +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, +#ifdef CONFIG_X86_32 + "EL32", +#else + "EL64", +#endif + 4)) { + efi_enabled = 1; + efi_reserve_early(); + } +#endif + + ARCH_SETUP + + setup_memory_map(); + copy_edd(); + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; +#ifdef CONFIG_X86_32 + init_mm.brk = init_pg_tables_end + PAGE_OFFSET; +#else + init_mm.brk = (unsigned long) &_end; +#endif + + code_resource.start = virt_to_phys(_text); + code_resource.end = virt_to_phys(_etext)-1; + data_resource.start = virt_to_phys(_etext); + data_resource.end = virt_to_phys(_edata)-1; + bss_resource.start = virt_to_phys(&__bss_start); + bss_resource.end = virt_to_phys(&__bss_stop)-1; + +#ifdef CONFIG_X86_64 + early_cpu_init(); +#endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_setup_data(); + + parse_early_param(); + + if (acpi_mps_check()) { +#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_32 + enable_local_apic = -1; +#else + disable_apic = 1; +#endif +#endif + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + + finish_e820_parsing(); + +#ifdef CONFIG_X86_32 + probe_roms(); +#else +# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +# endif +#endif + + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); + + if (efi_enabled) + efi_init(); + +#ifdef CONFIG_X86_32 + if (ppro_with_ram_bug()) { + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, + E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820_print_map("bad_ppro"); + } +#else + early_gart_iommu_check(); +#endif + + e820_register_active_regions(0, 0, -1UL); + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + max_pfn = e820_end_of_ram(); + + /* preallocate 4k for mptable mpc */ + early_reserve_e820_mpc_new(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) { + remove_all_active_ranges(); + e820_register_active_regions(0, 0, -1UL); + max_pfn = e820_end_of_ram(); + } + +#ifdef CONFIG_X86_32 + /* max_low_pfn get updated here */ + find_low_pfn_range(); +#else + num_physpages = max_pfn; + + check_efer(); + + /* How many end-of-memory variables you have, grandma! */ + /* need this before calling reserve_initrd */ + max_low_pfn = max_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#endif + + /* max_pfn_mapped is updated here */ + max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + + reserve_initrd(); + +#ifdef CONFIG_X86_64 + vsmp_init(); +#endif + + dmi_scan_machine(); + + io_delay_init(); + + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); + +#ifdef CONFIG_X86_64 + /* Remove active ranges so rediscovery with NUMA-awareness happens */ + remove_all_active_ranges(); +#endif + +#ifdef CONFIG_ACPI_NUMA + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); +#endif + + initmem_init(0, max_pfn); + +#ifdef CONFIG_X86_64 + dma32_reserve_bootmem(); +#endif + +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + */ + acpi_reserve_bootmem(); +#endif +#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \ + defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64) + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + reserve_crashkernel(); + + reserve_ibft_region(); + +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif + +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be after max_low_pfn is determined, and before kernel + * pagetables are setup. + */ + vmi_init(); +#endif + + paging_init(); + +#ifdef CONFIG_X86_64 + map_vsyscall(); +#endif + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32) + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + +#ifdef CONFIG_X86_GENERICARCH + generic_apic_probe(); +#endif + + early_quirks(); + + /* + * Read APIC and some other early information from ACPI tables. + */ + acpi_boot_init(); + +#ifdef CONFIG_X86_64 + init_cpu_to_node(); +#endif + +#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + get_smp_config(); +#endif + +#ifdef CONFIG_X86_64 + init_apic_mappings(); + ioapic_init_mappings(); +#else +# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + if (def_to_bigsmp) + printk(KERN_WARNING "More than 8 CPUs detected and " + "CONFIG_X86_PC cannot handle it.\nUse " + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); +# endif +#endif + kvm_guest_init(); + + e820_reserve_resources(); + e820_mark_nosave_regions(max_low_pfn); + +#ifdef CONFIG_X86_32 + request_resource(&iomem_resource, &video_ram_resource); +#endif + reserve_standard_io_resources(); + + e820_setup_gap(); + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif +} + diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c deleted file mode 100644 index 4d5f8a3..0000000 --- a/arch/x86/kernel/setup_32.c +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (C) 1995 Linus Torvalds - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * - * Memory region support - * David Parsons <orc@pell.chi.il.us>, July-August 1999 - * - * Added E820 sanitization routine (removes overlapping memory regions); - * Brian Moyle <bmoyle@mvista.com>, February 2001 - * - * Moved CPU detection code to cpu/${cpu}.c - * Patrick Mochel <mochel@osdl.org>, March 2002 - * - * Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/screen_info.h> -#include <linux/ioport.h> -#include <linux/acpi.h> -#include <linux/apm_bios.h> -#include <linux/initrd.h> -#include <linux/bootmem.h> -#include <linux/seq_file.h> -#include <linux/console.h> -#include <linux/mca.h> -#include <linux/root_dev.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/efi.h> -#include <linux/init.h> -#include <linux/edd.h> -#include <linux/iscsi_ibft.h> -#include <linux/nodemask.h> -#include <linux/kexec.h> -#include <linux/dmi.h> -#include <linux/pfn.h> -#include <linux/pci.h> -#include <asm/pci-direct.h> -#include <linux/init_ohci1394_dma.h> -#include <linux/kvm_para.h> - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/delay.h> -#include <linux/highmem.h> - -#include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/iscsi_ibft.h> -#include <linux/kexec.h> -#include <linux/cpufreq.h> -#include <linux/dma-mapping.h> -#include <linux/ctype.h> -#include <linux/uaccess.h> - -#include <linux/percpu.h> -#include <linux/crash_dump.h> - -#include <video/edid.h> - -#include <asm/mtrr.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/arch_hooks.h> -#include <asm/sections.h> -#include <asm/dmi.h> -#include <asm/io_apic.h> -#include <asm/ist.h> -#include <asm/vmi.h> -#include <setup_arch.h> -#include <asm/bios_ebda.h> -#include <asm/cacheflush.h> -#include <asm/processor.h> -#include <asm/bugs.h> - -#include <asm/system.h> -#include <asm/vsyscall.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/dma.h> -#include <asm/gart.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> - -#include <mach_apic.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#else -#define ARCH_SETUP -#endif - -#include <asm/percpu.h> -#include <asm/sections.h> -#include <asm/topology.h> -#include <asm/apicdef.h> -#ifdef CONFIG_X86_32 -#include <asm/highmem.h> -#endif - -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! */ -unsigned long init_pg_tables_start __initdata = ~0UL; -unsigned long init_pg_tables_end __initdata = ~0UL; - -/* - * Machine setup.. - */ -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - - -#ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; -/* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; -EXPORT_SYMBOL(boot_cpu_data); -static void set_mca_bus(int x) -{ -#ifdef CONFIG_MCA - MCA_bus = x; -#endif -} - -unsigned int def_to_bigsmp; - -/* for MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; - -struct apm_info apm_info; -EXPORT_SYMBOL(apm_info); - -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ - defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) -struct ist_info ist_info; -EXPORT_SYMBOL(ist_info); -#else -struct ist_info ist_info; -#endif - -#else -struct cpuinfo_x86 boot_cpu_data __read_mostly; -EXPORT_SYMBOL(boot_cpu_data); -#endif - - -#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; -#else -unsigned long mmu_cr4_features = X86_CR4_PAE; -#endif - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -/* - * Early DMI memory - */ -int dmi_alloc_index; -char dmi_alloc_data[DMI_MAX_DATA]; - -/* - * Setup options - */ -struct screen_info screen_info; -EXPORT_SYMBOL(screen_info); -struct edid_info edid_info; -EXPORT_SYMBOL_GPL(edid_info); - -extern int root_mountflags; - -unsigned long saved_video_mode; - -#define RAMDISK_IMAGE_START_MASK 0x07FF -#define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 - -static char __initdata command_line[COMMAND_LINE_SIZE]; - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, - sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); - edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; - edd.edd_info_nr = boot_params.eddbuf_entries; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -#ifdef CONFIG_BLK_DEV_INITRD - -#ifdef CONFIG_X86_32 - -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -static void __init relocate_initrd(void) -{ - - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; - u64 ramdisk_here; - unsigned long slop, clen, mapaddr; - char *p, *q; - - /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, - PAGE_SIZE); - - if (ramdisk_here == -1ULL) - panic("Cannot find place for new RAMDISK of size %lld\n", - ramdisk_size); - - /* Note: this includes all the lowmem currently occupied by - the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, - "NEW RAMDISK"); - initrd_start = ramdisk_here + PAGE_OFFSET; - initrd_end = initrd_start + ramdisk_size; - printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", - ramdisk_here, ramdisk_here + ramdisk_size); - - q = (char *)initrd_start; - - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_iounmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - /* high pages is not converted by early_res_to_bootmem */ - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; - printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" - " %08llx - %08llx\n", - ramdisk_image, ramdisk_image + ramdisk_size - 1, - ramdisk_here, ramdisk_here + ramdisk_size - 1); -} -#endif - -static void __init reserve_initrd(void) -{ - u64 ramdisk_image = boot_params.hdr.ramdisk_image; - u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; - - if (!boot_params.hdr.type_of_loader || - !ramdisk_image || !ramdisk_size) - return; /* No initrd provided by bootloader */ - - initrd_start = 0; - - if (ramdisk_size >= (end_of_lowmem>>1)) { - free_early(ramdisk_image, ramdisk_end); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; - } - - printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, - ramdisk_end); - - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ - /* - * don't need to reserve again, already reserved early - * in i386_start_kernel - */ - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start + ramdisk_size; - return; - } - -#ifdef CONFIG_X86_32 - relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif - free_early(ramdisk_image, ramdisk_end); -} -#else -static void __init reserve_initrd(void) -{ -} -#endif /* CONFIG_BLK_DEV_INITRD */ - -/* - * Determine if we were loaded by an EFI loader. If so, then we have also been - * passed the efi memmap, systab, etc., so we should use these data structures - * for initialization. Note, the efi init code path is determined by the - * global efi_enabled. This allows the same kernel image to be used on existing - * systems (with a traditional BIOS) as well as on EFI systems. - */ -/* - * setup_arch - architecture-specific boot-time initializations - * - * Note: On x86_64, fixmaps are ready for use even before this is called. - */ - -void __init setup_arch(char **cmdline_p) -{ -#ifdef CONFIG_X86_32 - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); - pre_setup_arch_hook(); - early_cpu_init(); - early_ioremap_init(); - reserve_setup_data(); -#else - printk(KERN_INFO "Command line: %s\n", boot_command_line); -#endif - - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; -#ifdef CONFIG_X86_32 - apm_info.bios = boot_params.apm_bios_info; - ist_info = boot_params.ist_info; - if (boot_params.sys_desc_table.length != 0) { - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); - machine_id = boot_params.sys_desc_table.table[0]; - machine_submodel_id = boot_params.sys_desc_table.table[1]; - BIOS_revision = boot_params.sys_desc_table.table[2]; - } -#endif - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 - "EL32", -#else - "EL64", -#endif - 4)) { - efi_enabled = 1; - efi_reserve_early(); - } -#endif - - ARCH_SETUP - - setup_memory_map(); - copy_edd(); - - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; -#ifdef CONFIG_X86_32 - init_mm.brk = init_pg_tables_end + PAGE_OFFSET; -#else - init_mm.brk = (unsigned long) &_end; -#endif - - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; - -#ifdef CONFIG_X86_64 - early_cpu_init(); -#endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_setup_data(); - - parse_early_param(); - - if (acpi_mps_check()) { -#ifdef CONFIG_X86_LOCAL_APIC -#ifdef CONFIG_X86_32 - enable_local_apic = -1; -#else - disable_apic = 1; -#endif -#endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - } - - finish_e820_parsing(); - -#ifdef CONFIG_X86_32 - probe_roms(); -#else -# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - if (init_ohci1394_dma_early) - init_ohci1394_dma_on_all_controllers(); -# endif -#endif - - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); - - if (efi_enabled) - efi_init(); - -#ifdef CONFIG_X86_32 - if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); - } -#else - early_gart_iommu_check(); -#endif - - e820_register_active_regions(0, 0, -1UL); - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - max_pfn = e820_end_of_ram(); - - /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); - /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); - if (mtrr_trim_uncached_memory(max_pfn)) { - remove_all_active_ranges(); - e820_register_active_regions(0, 0, -1UL); - max_pfn = e820_end_of_ram(); - } - -#ifdef CONFIG_X86_32 - /* max_low_pfn get updated here */ - find_low_pfn_range(); -#else - num_physpages = max_pfn; - - check_efer(); - - /* How many end-of-memory variables you have, grandma! */ - /* need this before calling reserve_initrd */ - max_low_pfn = max_pfn; - high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; -#endif - - /* max_pfn_mapped is updated here */ -#ifdef CONFIG_X86_64 - max_pfn_mapped = -#endif - max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); - - reserve_initrd(); - -#ifdef CONFIG_X86_64 - vsmp_init(); -#endif - - dmi_scan_machine(); - - io_delay_init(); - - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - -#ifdef CONFIG_X86_64 - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); -#endif - -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - - initmem_init(0, max_pfn); - -#ifdef CONFIG_X86_64 - dma32_reserve_bootmem(); -#endif - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif -#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \ - defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64) - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif - reserve_crashkernel(); - - reserve_ibft_region(); - -#ifdef CONFIG_KVM_CLOCK - kvmclock_init(); -#endif - -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be after max_low_pfn is determined, and before kernel - * pagetables are setup. - */ - vmi_init(); -#endif - - paging_init(); - -#ifdef CONFIG_X86_64 - map_vsyscall(); -#endif - - /* - * NOTE: On x86-32, only from this point on, fixmaps are ready for use. - */ - -#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32) - if (init_ohci1394_dma_early) - init_ohci1394_dma_on_all_controllers(); -#endif - -#ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(); -#endif - - early_quirks(); - - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); - -#ifdef CONFIG_X86_64 - init_cpu_to_node(); -#endif - -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); -#endif - -#ifdef CONFIG_X86_64 - init_apic_mappings(); - ioapic_init_mappings(); -#else -# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) - if (def_to_bigsmp) - printk(KERN_WARNING "More than 8 CPUs detected and " - "CONFIG_X86_PC cannot handle it.\nUse " - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -# endif -#endif - kvm_guest_init(); - - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); - -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); - - e820_setup_gap(); - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif -} - -- cgit v1.1 From 217b8ce89088dd47e7176686ff34573f75a624e9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:55:20 -0700 Subject: x86: move boot_params back to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 6 ++++++ arch/x86/kernel/setup_percpu.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 222d703..f24a8fe 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -118,6 +118,12 @@ #include <asm/highmem.h> #endif +#ifndef CONFIG_DEBUG_BOOT_PARAMS +struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif + /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5c0c4bb..5497fb9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -15,12 +15,6 @@ #include <asm/apicdef.h> #include <asm/highmem.h> -#ifndef CONFIG_DEBUG_BOOT_PARAMS -struct boot_params __initdata boot_params; -#else -struct boot_params boot_params; -#endif - #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; -- cgit v1.1 From 257b0fde99df0160db03e529dbfb3a4e46c07a88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:56:22 -0700 Subject: x86: move parse_setup_data back to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 25 +++++++++++++++++++++++++ arch/x86/kernel/setup_percpu.c | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f24a8fe..4b00c83 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -375,6 +375,31 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + case SETUP_E820_EXT: + parse_e820_ext(data, pa_data); + break; + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5497fb9..2f80750 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -214,31 +214,6 @@ void __init setup_per_cpu_areas(void) #endif -void __init parse_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); - switch (data->type) { - case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); - break; - default: - break; - } -#ifndef CONFIG_DEBUG_BOOT_PARAMS - free_early(pa_data, pa_data+sizeof(*data)+data->len); -#endif - pa_data = data->next; - early_iounmap(data, PAGE_SIZE); - } -} - #ifdef X86_64_NUMA /* -- cgit v1.1 From ccb4defa71744f086822950d8fa64a17c4e6eb04 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:57:13 -0700 Subject: x86: move back crashkernel back to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 58 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_percpu.c | 57 ----------------------------------------- 2 files changed, 58 insertions(+), 57 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4b00c83..75d3540 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -401,6 +401,64 @@ void __init parse_setup_data(void) } /* + * --------- Crashkernel reservation ------------------------------ + */ + +#ifdef CONFIG_KEXEC +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base <= 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + } +} +#else +void __init reserve_crashkernel(void) +{ +} +#endif + +/* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures * for initialization. Note, the efi init code path is determined by the diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2f80750..a3a19ab 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -387,63 +387,6 @@ EXPORT_SYMBOL(node_to_cpumask); #endif /* X86_64_NUMA */ - -/* - * --------- Crashkernel reservation ------------------------------ - */ - -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif - - return total << PAGE_SHIFT; -} - -#ifdef CONFIG_KEXEC -void __init reserve_crashkernel(void) -{ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; - int ret; - - total_mem = get_total_mem(); - - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); - return; - } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); - return; - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } -} -#else -void __init reserve_crashkernel(void) -{} -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, -- cgit v1.1 From bdba0e700c86fa2f152b1fe37b001c9e9c65d2b7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:58:02 -0700 Subject: x86: move reserve_standard_io_resources back to setup.c Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_percpu.c | 32 -------------------------------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 75d3540..62647b0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -458,6 +458,39 @@ void __init reserve_crashkernel(void) } #endif +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a3a19ab..ccf329d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -387,38 +387,6 @@ EXPORT_SYMBOL(node_to_cpumask); #endif /* X86_64_NUMA */ -static struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x60, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x64, .end = 0x64, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -void __init reserve_standard_io_resources(void) -{ - int i; - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - -} #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header -- cgit v1.1 From 0196bcbb150786d54a50e3074013020570a59d31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:58:55 -0700 Subject: x86: move parse elfvorehdr back to setup.c Signed-off-by: Yinghai <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 16 ++++++++++++++++ arch/x86/kernel/setup_percpu.c | 19 ------------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 62647b0..08efab5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -491,6 +491,22 @@ void __init reserve_standard_io_resources(void) } +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); +#endif + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ccf329d..7068f95 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -387,22 +387,3 @@ EXPORT_SYMBOL(node_to_cpumask); #endif /* X86_64_NUMA */ - -#ifdef CONFIG_PROC_VMCORE -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - - - -- cgit v1.1 From d1b20afec356085a202d7832d47bfb89303ea901 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 17:59:41 -0700 Subject: x86: make x86_find_smp_config depends on 64 bit too Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 1 - arch/x86/kernel/setup.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25ff754..9441e46 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -233,7 +233,6 @@ config SMP config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER || X86_VISWS - depends on X86_32 if ACPI config X86_MPPARSE diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08efab5..7690547 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -714,8 +714,7 @@ void __init setup_arch(char **cmdline_p) */ acpi_reserve_bootmem(); #endif -#if defined(CONFIG_X86_FIND_SMP_CONFIG) && defined(CONFIG_X86_32) || \ - defined(CONFIG_X86_MPPARSE) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_FIND_SMP_CONFIG /* * Find and reserve possible boot-time SMP configuration: */ -- cgit v1.1 From 29f784e369a914b5926e01a0b0caae0b47f6452a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 18:00:22 -0700 Subject: x86: change some functions in setup.c to static Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 8 ++++---- include/asm-x86/bootparam.h | 1 - include/asm-x86/setup.h | 5 ----- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7690547..35022dc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -375,7 +375,7 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -void __init parse_setup_data(void) +static void __init parse_setup_data(void) { struct setup_data *data; u64 pa_data; @@ -417,7 +417,7 @@ static inline unsigned long long get_total_mem(void) return total << PAGE_SHIFT; } -void __init reserve_crashkernel(void) +static void __init reserve_crashkernel(void) { unsigned long long total_mem; unsigned long long crash_size, crash_base; @@ -453,7 +453,7 @@ void __init reserve_crashkernel(void) } } #else -void __init reserve_crashkernel(void) +static void __init reserve_crashkernel(void) { } #endif @@ -481,7 +481,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -void __init reserve_standard_io_resources(void) +static void __init reserve_standard_io_resources(void) { int i; diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index 55ae9d0..6eeba3b 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -109,6 +109,5 @@ struct boot_params { } __attribute__((packed)); void reserve_setup_data(void); -void parse_setup_data(void); #endif /* _ASM_BOOTPARAM_H */ diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index bb12a16..269ba7f 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -8,9 +8,6 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); -/* Crashkernel reservation */ -void reserve_crashkernel(void); - #ifndef CONFIG_PARAVIRT #define paravirt_post_allocator_init() do {} while (0) #endif @@ -38,8 +35,6 @@ void reserve_crashkernel(void); #ifndef __ASSEMBLY__ #include <asm/bootparam.h> -void reserve_standard_io_resources(void); - #ifndef _SETUP /* -- cgit v1.1 From 5092301c7250d7459b79fe40dae43eca3b518e92 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 18:02:06 -0700 Subject: x86: we only have init_pg_tables_end for 32bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 35022dc..f2e314b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,12 +124,6 @@ struct boot_params __initdata boot_params; struct boot_params boot_params; #endif -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! */ -unsigned long init_pg_tables_start __initdata = ~0UL; -unsigned long init_pg_tables_end __initdata = ~0UL; - /* * Machine setup.. */ @@ -156,6 +150,12 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 +/* This value is set up by the early boot code to point to the value + immediately after the boot time page tables. It contains a *physical* + address, and must not be in the .bss segment! */ +unsigned long init_pg_tables_start __initdata = ~0UL; +unsigned long init_pg_tables_end __initdata = ~0UL; + static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, -- cgit v1.1 From 3442682a54a9f44047efe526869aa52bd74fe16e Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 26 Jun 2008 12:29:29 +0200 Subject: x86: remove extra newline from setup.c Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f2e314b..172a83e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -806,4 +806,3 @@ void __init setup_arch(char **cmdline_p) #endif #endif } - -- cgit v1.1 From 330ddd20894f99a2b956ad59cf0cfdba188bde63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 26 Jun 2008 12:40:35 +0200 Subject: x86: build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: In file included from arch/x86/kernel/setup.c:118: include/asm/highmem.h:64: error: expected identifier or ‘(' before ‘do' include/asm/highmem.h:64: error: expected identifier or ‘(' before ‘while' include/asm/highmem.h:67: error: expected identifier or ‘(' before ‘do' include/asm/highmem.h:67: error: expected identifier or ‘(' before ‘while' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 172a83e..63dbb5e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,9 +114,6 @@ #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif -#ifdef CONFIG_X86_32 -#include <asm/highmem.h> -#endif #ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; -- cgit v1.1 From 102e3b8d3f3d5556c60f9ab6d92108649b68edc8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:09 -0400 Subject: x86, 64-bit: add prototype for x86_64_start_kernel() Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/setup.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 269ba7f..42bc62b 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -56,6 +56,8 @@ extern void probe_roms(void); extern unsigned long init_pg_tables_start; extern unsigned long init_pg_tables_end; +#else +void __init x86_64_start_kernel(char *real_mode); #endif /* __i386__ */ #endif /* _SETUP */ #endif /* __ASSEMBLY__ */ -- cgit v1.1 From 15878c0b21b7b04a08108e9027ebbbd68a2502e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:10 -0400 Subject: x86, 64-bit: add sync_cmpxchg Add sync_cmpxchg to match 32-bit's sync_cmpxchg. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/cmpxchg_64.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/asm-x86/cmpxchg_64.h b/include/asm-x86/cmpxchg_64.h index d9b26b9..17463cc 100644 --- a/include/asm-x86/cmpxchg_64.h +++ b/include/asm-x86/cmpxchg_64.h @@ -93,6 +93,39 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, return old; } +/* + * Always use locked operations when touching memory shared with a + * hypervisor, since the system may be SMP even if the guest kernel + * isn't. + */ +static inline unsigned long __sync_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + asm volatile("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + asm volatile("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + asm volatile("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + static inline unsigned long __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new, int size) @@ -139,6 +172,10 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ (unsigned long)(n), \ sizeof(*(ptr)))) +#define sync_cmpxchg(ptr, o, n) \ + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), \ + sizeof(*(ptr)))) #define cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ -- cgit v1.1 From 67350a5c4514c280665cdb45439d32a008a264ba Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:11 -0400 Subject: x86: simplify vmalloc_sync_all vmalloc_sync_all() is only called from register_die_notifier and alloc_vm_area. Neither is on any performance-critical paths, so vmalloc_sync_all() itself is not on any hot paths. Given that the optimisations in vmalloc_sync_all add a fair amount of code and complexity, and are fairly hard to evaluate for correctness, it's better to just remove them to simplify the code rather than worry about its absolute performance. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/fault.c | 77 ++++++++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 51 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 578b768..d0f5fce 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -903,14 +903,7 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { #ifdef CONFIG_X86_32 - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; + unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; if (SHARED_KERNEL_PMD) @@ -918,56 +911,38 @@ void vmalloc_sync_all(void) BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; + spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - set_bit(pgd_index(address), insync); + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } - if (address == start) - start = address + PGDIR_SIZE; + spin_unlock_irqrestore(&pgd_lock, flags); } #endif } -- cgit v1.1 From eba0045ff87bab465d3c80c289f3bf709c1800f5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:12 -0400 Subject: x86/paravirt: add a pgd_alloc/free hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add hooks which are called at pgd_alloc/free time. The pgd_alloc hook may return an error code, which if non-zero, causes the pgd allocation to be failed. The hooks may be used to allocate/free auxillary per-pgd information. also fix: > * Ingo Molnar <mingo@elte.hu> wrote: > > include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted > arch/x86/kernel/entry_64.S: In file included from > arch/x86/kernel/traps_64.c:51:include/asm/pgalloc.h: In function ‘paravirt_pgd_free': > include/asm/pgalloc.h:14: error: parameter name omitted Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/mm/pgtable.c | 13 ++++++++----- arch/x86/xen/enlighten.c | 4 ++++ include/asm-x86/paravirt.h | 19 ++++++++++++++++++- include/asm-x86/pgalloc.h | 4 ++++ 5 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e9b5045..78c9a1b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -30,6 +30,7 @@ #include <asm/setup.h> #include <asm/arch_hooks.h> #include <asm/time.h> +#include <asm/pgalloc.h> #include <asm/irq.h> #include <asm/delay.h> #include <asm/fixmap.h> @@ -366,6 +367,9 @@ struct pv_mmu_ops pv_mmu_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, + .pgd_alloc = __paravirt_pgd_alloc, + .pgd_free = paravirt_nop, + .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pmd_clone = paravirt_nop, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 45b99ac..418c443 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -215,13 +215,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) /* so that alloc_pmd can use it */ mm->pgd = pgd; - if (pgd) + if (pgd) { pgd_ctor(pgd); - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; + if (paravirt_pgd_alloc(mm) != 0 || + !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } } return pgd; @@ -231,6 +233,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); free_page((unsigned long)pgd); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 76ad1ef..d62f14e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> +#include <asm/pgalloc.h> #include "xen-ops.h" #include "mmu.h" @@ -1153,6 +1154,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pgd_alloc = __paravirt_pgd_alloc, + .pgd_free = paravirt_nop, + .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, .alloc_pmd = xen_alloc_pte_init, diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 41f5447..5467e2c 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -219,7 +219,14 @@ struct pv_mmu_ops { void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, unsigned long va); - /* Hooks for allocating/releasing pagetable pages */ + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ void (*alloc_pte)(struct mm_struct *mm, u32 pfn); void (*alloc_pmd)(struct mm_struct *mm, u32 pfn); void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); @@ -925,6 +932,16 @@ static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); } +static inline int paravirt_pgd_alloc(struct mm_struct *mm) +{ + return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm); +} + +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd); +} + static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned pfn) { PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn); diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h index 91e4641..d63ea43 100644 --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -5,9 +5,13 @@ #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else +#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, -- cgit v1.1 From d8d5900ef8afc562088f8470feeaf17c4747790f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:13 -0400 Subject: x86: preallocate and prepopulate separately Jan Beulich points out that vmalloc_sync_all() assumes that the kernel's pmd is always expected to be present in the pgd. The current pgd construction code will add the pgd to the pgd_list before its pmds have been pre-populated, thereby making it visible to vmalloc_sync_all(). However, because pgd_prepopulate_pmd also does the allocation, it may block and cannot be done under spinlock. The solution is to preallocate the pmds out of the spinlock, then populate them while holding the pgd_list lock. This patch also pulls the pmd preallocation and mop-up functions out to be common, assuming that the compiler will generate no code for them when PREALLOCTED_PMDS is 0. Also, there's no need for pgd_ctor to clear the pgd again, since it's allocated as a zeroed page. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Jan Beulich <jbeulich@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pgtable.c | 169 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 68 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 418c443..557b2ab 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -66,12 +66,6 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_ctor(void *p) { pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -91,8 +85,6 @@ static void pgd_ctor(void *p) /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); } static void pgd_dtor(void *pgd) @@ -120,6 +112,72 @@ static void pgd_dtor(void *pgd) #ifdef CONFIG_X86_PAE /* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +#define PREALLOCATED_PMDS 0 + +#endif /* CONFIG_X86_PAE */ + +static void free_pmds(pmd_t *pmds[]) +{ + int i; + + for(i = 0; i < PREALLOCATED_PMDS; i++) + if (pmds[i]) + free_page((unsigned long)pmds[i]); +} + +static int preallocate_pmds(pmd_t *pmds[]) +{ + int i; + bool failed = false; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + if (pmd == NULL) + failed = true; + pmds[i] = pmd; + } + + if (failed) { + free_pmds(pmds); + return -ENOMEM; + } + + return 0; +} + +/* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be @@ -129,7 +187,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + for(i = 0; i < PREALLOCATED_PMDS; i++) { pgd_t pgd = pgdp[i]; if (pgd_val(pgd) != 0) { @@ -143,32 +201,17 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) } } -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; unsigned long addr; int i; pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } + for (addr = i = 0; i < PREALLOCATED_PMDS; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), @@ -176,57 +219,47 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) pud_populate(mm, pud, pmd); } - - return 1; } -void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +pgd_t *pgd_alloc(struct mm_struct *mm) { - paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + pgd_t *pgd; + pmd_t *pmds[PREALLOCATED_PMDS]; + unsigned long flags; - /* Note: almost everything apart from _PAGE_PRESENT is - reserved at the pmd (PDPT) level. */ - set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (preallocate_pmds(pmds) != 0) + goto out_free_pgd; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_pmds; /* - * According to Intel App note "TLBs, Paging-Structure Caches, - * and Their Invalidation", April 2007, document 317080-001, - * section 8.1: in PAE mode we explicitly have to flush the - * TLB via cr3 if the top-level pgd is changed... + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. */ - if (mm == current->active_mm) - write_cr3(read_cr3()); -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) -{ -} -#endif /* CONFIG_X86_PAE */ + spin_lock_irqsave(&pgd_lock, flags); -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + pgd_ctor(pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); - /* so that alloc_pmd can use it */ - mm->pgd = pgd; - if (pgd) { - pgd_ctor(pgd); - - if (paravirt_pgd_alloc(mm) != 0 || - !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - } + spin_unlock_irqrestore(&pgd_lock, flags); return pgd; + +out_free_pmds: + free_pmds(pmds); +out_free_pgd: + free_page((unsigned long)pgd); +out: + return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) -- cgit v1.1 From 97349135fea7f0ba8464534433df3bfd1dc0e9a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:14 -0400 Subject: x86/paravirt: add debugging for missing operations Rather than just jumping to 0 when there's a missing operation, raise a BUG. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 7 +++++++ include/asm-x86/paravirt.h | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9441e46..67e5275 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -435,6 +435,13 @@ config PARAVIRT_CLOCK endif +config PARAVIRT_DEBUG + bool "paravirt-ops debugging" + depends on PARAVIRT && DEBUG_KERNEL + help + Enable to debug paravirt_ops internals. Specifically, BUG if + a paravirt_op is missing when it is called. + config MEMTEST bool "Memtest" depends on X86_64 diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 5467e2c..198293b 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -459,10 +459,17 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" #endif +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + #define __PVOP_CALL(rettype, op, pre, post, ...) \ ({ \ rettype __ret; \ PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ /* This is 32-bit specific, but is okay in 64-bit */ \ /* since this condition will never hold */ \ if (sizeof(rettype) > sizeof(unsigned long)) { \ @@ -491,6 +498,7 @@ int paravirt_disable_iospace(void); #define __PVOP_VCALL(op, pre, post, ...) \ ({ \ PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ -- cgit v1.1 From 491eccb721c2ee67250273a96e4515fb5b423337 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:15 -0400 Subject: x86/paravirt: define PARA_INDIRECT for indirect asm calls On 32-bit it's best to use a %cs: prefix to access memory where the other segments may not bet set up properly yet. On 64-bit it's best to use a rip-relative addressing mode. Define PARA_INDIRECT() to abstract this and generate the proper addressing mode in each case. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/paravirt.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 198293b..82cdcde 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -1455,51 +1455,53 @@ static inline unsigned long __raw_local_irq_save(void) #define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) +#define PARA_INDIRECT(addr) *addr(%rip) #else #define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx #define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) +#define PARA_INDIRECT(addr) *%cs:addr #endif #define INTERRUPT_RETURN \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ - jmp *%cs:pv_cpu_ops+PV_CPU_iret) + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ - PV_SAVE_REGS; \ - call *%cs:pv_irq_ops+PV_IRQ_irq_disable; \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ PV_RESTORE_REGS;) \ #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ - PV_SAVE_REGS; \ - call *%cs:pv_irq_ops+PV_IRQ_irq_enable; \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS;) #define ENABLE_INTERRUPTS_SYSCALL_RET \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\ CLBR_NONE, \ - jmp *%cs:pv_cpu_ops+PV_CPU_irq_enable_syscall_ret) + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)) #ifdef CONFIG_X86_32 -#define GET_CR0_INTO_EAX \ - push %ecx; push %edx; \ - call *pv_cpu_ops+PV_CPU_read_cr0; \ +#define GET_CR0_INTO_EAX \ + push %ecx; push %edx; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx #else #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ PV_SAVE_REGS; \ - call *pv_cpu_ops+PV_CPU_swapgs; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ PV_RESTORE_REGS \ ) -#define GET_CR2_INTO_RCX \ - call *pv_mmu_ops+PV_MMU_read_cr2; \ - movq %rax, %rcx; \ +#define GET_CR2_INTO_RCX \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \ + movq %rax, %rcx; \ xorq %rax, %rax; #endif -- cgit v1.1 From a6523748bddd38bcec11431f57502090b6014a96 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost <ehabkost@redhat.com> Date: Wed, 25 Jun 2008 00:19:16 -0400 Subject: paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for hypervisor Set __PAGE_OFFSET to the most negative possible address + 16*PGDIR_SIZE. The gap is to allow a space for a hypervisor to fit. The gap is more or less arbitrary, but it's what Xen needs. When booting native, kernel/head_64.S has a set of compile-time generated pagetables used at boot time. This patch removes their absolutely hard-coded layout, and makes it parameterised on __PAGE_OFFSET (and __START_KERNEL_map). Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 17 ++++++++++++----- include/asm-x86/page_64.h | 8 +++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 32f5a11..38279fa 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -32,6 +32,13 @@ * */ +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) +L4_START_KERNEL = pgd_index(__START_KERNEL_map) +L3_START_KERNEL = pud_index(__START_KERNEL_map) + .text .section .text.head .code64 @@ -77,8 +84,8 @@ startup_64: /* Fixup the physical addresses in the page table */ addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (258*8)(%rip) - addq %rbp, init_level4_pgt + (511*8)(%rip) + addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) + addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_ident_pgt + 0(%rip) @@ -338,9 +345,9 @@ ENTRY(name) */ NEXT_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 257,8,0 + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 252,8,0 + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -349,7 +356,7 @@ NEXT_PAGE(level3_ident_pgt) .fill 511,8,0 NEXT_PAGE(level3_kernel_pgt) - .fill 510,8,0 + .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index cac5b9e..010d12d 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -26,7 +26,13 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PAGE_OFFSET _AC(0xffff810000000000, UL) +/* + * Set __PAGE_OFFSET to the most negative possible address + + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a + * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's + * what Xen requires. + */ +#define __PAGE_OFFSET _AC(0xffff880000000000, UL) #define __PHYSICAL_START CONFIG_PHYSICAL_START #define __KERNEL_ALIGN 0x200000 -- cgit v1.1 From 408011759cc8ff7f89505e8398cec0ccf67b5afa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:17 -0400 Subject: x86, 64-bit: add FIX_PARAVIRT_BOOTMAP fixmap slot This matches 32 bit. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/fixmap_64.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index c7dcc36..1a0b61e 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -46,6 +46,9 @@ enum fixed_addresses { FIX_EFI_IO_MAP_LAST_PAGE, FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE + MAX_EFI_IO_PAGES - 1, +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, #endif -- cgit v1.1 From f97013fd8f17120182aa247f360e4d2069a9db9c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:18 -0400 Subject: x86, 64-bit: split x86_64_start_kernel Split x86_64_start_kernel() into two pieces: The first essentially cleans up after head_64.S. It clears the bss, zaps low identity mappings, sets up some early exception handlers. The second part preserves the boot data, reserves the kernel's text/data/bss, pagetables and ramdisk, and then starts the kernel proper. This split is so that Xen can call the second part to do the set up it needs done. It doesn't need any of the first part setups, because it doesn't boot via head_64.S, and its redundant or actively damaging. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head64.c | 5 +++++ include/asm-x86/setup.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c970929..f684e3b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -108,6 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel really alive\n"); + x86_64_start_reservations(real_mode_data); +} + +void __init x86_64_start_reservations(char *real_mode_data) +{ copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 42bc62b..1d121c6 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -58,6 +58,8 @@ extern unsigned long init_pg_tables_end; #else void __init x86_64_start_kernel(char *real_mode); +void __init x86_64_start_reservations(char *real_mode_data); + #endif /* __i386__ */ #endif /* _SETUP */ #endif /* __ASSEMBLY__ */ -- cgit v1.1 From 4f9c11dd49fb73e1ec088b27ed6539681a445988 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:19 -0400 Subject: x86, 64-bit: adjust mapping of physical pagetables to work with Xen This makes a few of changes to the construction of the initial pagetables to work better with paravirt_ops/Xen. The main areas are: 1. Support non-PSE mapping of memory, since Xen doesn't currently allow 2M pages to be mapped in guests. 2. Make sure that the ioremap alias of all pages are dropped before attaching the new page to the pagetable. This avoids having writable aliases of pagetable pages. 3. Preserve existing pagetable entries, rather than overwriting. Its possible that a fair amount of pagetable has already been constructed, so reuse what's already in place rather than ignoring and overwriting it. The algorithm relies on the invariant that any page which is part of the kernel pagetable is itself mapped in the linear memory area. This way, it can avoid using ioremap on a pagetable page. The invariant holds because it maps memory from low to high addresses, and also allocates memory from low to high. Each allocated page can map at least 2M of address space, so the mapped area will always progress much faster than the allocated area. It relies on the early boot code mapping enough pages to get started. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 94 ++++++++++++++++++++++++++++++++++++++++++++------- arch/x86/mm/ioremap.c | 2 +- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d5d4b04..363751d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -253,6 +253,43 @@ static __meminit void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } +static void __meminit +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +{ + unsigned pages = 0; + int i; + pte_t *pte = pte_page + pte_index(addr); + + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { + + if (addr >= end) { + if (!after_bootmem) { + for(; i < PTRS_PER_PTE; i++, pte++) + set_pte(pte, __pte(0)); + } + break; + } + + if (pte_val(*pte)) + continue; + + if (0) + printk(" pte=%p addr=%lx pte=%016lx\n", + pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); + pages++; + } + update_page_count(PG_LEVEL_4K, pages); +} + +static void __meminit +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +{ + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + + phys_pte_init(pte, address, end); +} + static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { @@ -261,7 +298,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); + pte_t *pte; if (address >= end) { if (!after_bootmem) { @@ -271,12 +310,23 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) break; } - if (pmd_val(*pmd)) + if (pmd_val(*pmd)) { + phys_pte_update(pmd, address, end); + continue; + } + + if (cpu_has_pse) { + pages++; + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); continue; + } - pages++; - set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pte = alloc_low_page(&pte_phys); + phys_pte_init(pte, address, end); + unmap_low_page(pte); + + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); } update_page_count(PG_LEVEL_2M, pages); return address; @@ -333,11 +383,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); last_map_addr = phys_pmd_init(pmd, addr, end); + unmap_low_page(pmd); + pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(pmd); } __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -345,16 +395,30 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) return last_map_addr; } +static unsigned long __meminit +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + pud_t *pud; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + + return phys_pud_init(pud, addr, end); +} + static void __init find_early_table_space(unsigned long end) { - unsigned long puds, pmds, tables, start; + unsigned long puds, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); if (!direct_gbpages) { - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); } + if (!cpu_has_pse) { + unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); + } /* * RED-PEN putting page tables only on node 0 could @@ -526,19 +590,25 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon unsigned long pud_phys; pud_t *pud; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_val(*pgd)) { + last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end)); + continue; + } + if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else pud = alloc_low_page(&pud_phys); - next = start + PGDIR_SIZE; - if (next > end) - next = end; last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); + unmap_low_page(pud); if (!after_bootmem) pgd_populate(&init_mm, pgd_offset_k(start), __va(pud_phys)); - unmap_low_page(pud); } if (!after_bootmem) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 092b3d7..45e546c 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -485,7 +485,7 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else - pte_clear(NULL, addr, pte); + pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } -- cgit v1.1 From 7c934d3990aa4d785feddcef700f4c2c4aba2251 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:20 -0400 Subject: x86, 64-bit: create small vmemmap mappings if PSE not available If PSE is not available, then fall back to 4k page mappings for the vmemmap area. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 62 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 363751d..e62cbdd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -988,7 +988,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) pmd_t *pmd; for (; addr < end; addr = next) { - next = pmd_addr_end(addr, end); + void *p = NULL; pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -998,33 +998,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (!pud) return -ENOMEM; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - pte_t entry; - void *p; + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = vmemmap_pmd_populate(pud, addr, node); + + if (!pmd) + return -ENOMEM; + + p = vmemmap_pte_populate(pmd, addr, node); - p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); - - /* check to see if we have contiguous blocks */ - if (p_end != p || node_start != node) { - if (p_start) - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", - addr_start, addr_end-1, p_start, p_end-1, node_start); - addr_start = addr; - node_start = node; - p_start = p; - } - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; + addr_end = addr + PAGE_SIZE; + p_end = p + PAGE_SIZE; } else { - vmemmap_verify((pte_t *)pmd, node, addr, next); + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + + p = vmemmap_alloc_block(PMD_SIZE, node); + if (!p) + return -ENOMEM; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + } else + vmemmap_verify((pte_t *)pmd, node, addr, next); } + } return 0; } -- cgit v1.1 From 4f30cb0262847392d8d006042f24bd90abd24f9d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:21 -0400 Subject: x86, 64-bit: PSE no longer a hard requirement Because Xen doesn't support PSE mappings in guests, all code which assumed the presence of PSE has been changed to fall back to smaller mappings if necessary. As a result, PSE is optional rather than required (though still used whereever possible). Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/required-features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86/required-features.h b/include/asm-x86/required-features.h index 8c38719..adec887 100644 --- a/include/asm-x86/required-features.h +++ b/include/asm-x86/required-features.h @@ -42,7 +42,7 @@ #endif #ifdef CONFIG_X86_64 -#define NEED_PSE (1<<(X86_FEATURE_PSE & 31)) +#define NEED_PSE 0 #define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -- cgit v1.1 From 0814e0bace537b7024b09187346b99401e6281be Mon Sep 17 00:00:00 2001 From: Eduardo Habkost <ehabkost@redhat.com> Date: Wed, 25 Jun 2008 00:19:22 -0400 Subject: x86, 64-bit: split set_pte_vaddr() We will need to set a pte on l3_user_pgt. Extract set_pte_vaddr_pud() from set_pte_vaddr(), that will accept the l3 page table as parameter. This change should be a no-op for existing code. Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 31 ++++++++++++++++++++----------- include/asm-x86/pgtable_64.h | 3 +++ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e62cbdd..5c3305e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -144,22 +144,13 @@ static __init void *spp_getpage(void) } void -set_pte_vaddr(unsigned long vaddr, pte_t new_pte) +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte)); - - pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - printk(KERN_ERR - "PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - pud = pud_offset(pgd, vaddr); + pud = pud_page + pud_index(vaddr); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); @@ -192,6 +183,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void +set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + pud_t *pud_page; + + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud_page = (pud_t*)pgd_page_vaddr(*pgd); + set_pte_vaddr_pud(pud_page, vaddr, pteval); +} + /* * The head.S code sets up the kernel high mapping: * diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 9a9350a..fa7208b 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -70,6 +70,9 @@ extern void paging_init(void); struct mm_struct; +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { -- cgit v1.1 From 3fe0a63efd4437f6438ce5f2708929b1108873b6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:23 -0400 Subject: x86, 64-bit: __switch_to(): move arch_leave_lazy_cpu_mode() to the right place We must leave lazy mode before switching the %fs and %gs selectors. Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process_64.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ddc6fcc..488eaca 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -562,6 +562,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_leave_lazy_cpu_mode(); + /* * Switch FS and GS. */ -- cgit v1.1 From 478de5a9d691dd0c048ddce62dbec23722515636 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:24 -0400 Subject: x86: save %fs and %gs before load_TLS() and arch_leave_lazy_cpu_mode() We must do this because load_TLS() may need to clear %fs and %gs. (e.g. under Xen). Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/process_64.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 488eaca..db5eb96 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -538,6 +538,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + unsigned fsindex, gsindex; /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter>5) @@ -560,6 +561,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); + + /* We must save %fs and %gs before load_TLS() because + * %fs and %gs may be cleared by load_TLS(). + * + * (e.g. xen_load_tls()) + */ + savesegment(fs, fsindex); + savesegment(gs, gsindex); + load_TLS(next, cpu); /* @@ -575,8 +585,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch FS and GS. */ { - unsigned fsindex; - savesegment(fs, fsindex); /* segment register != 0 always requires a reload. also reload when it has changed. when prev process used 64bit base always reload @@ -594,10 +602,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; - } - { - unsigned gsindex; - savesegment(gs, gsindex); + if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); if (gsindex) -- cgit v1.1 From e04e0a630d8b5c621b3a8e70ff20db737d3a5728 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:25 -0400 Subject: x86: use __KERNEL_DS as SS when returning to a kernel thread This is needed when the kernel is running on RING3, such as under Xen. x86_64 has a weird feature that makes it #GP on iret when SS is a null descriptor. This need to be tested on bare metal to make sure it doesn't cause any problems. AMD specs say SS is always ignored (except on iret?). Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ff15ab5..6d11014 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -104,7 +104,7 @@ ENTRY(native_irq_enable_syscall_ret) .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq %rax /* ss */ + pushq $__KERNEL_DS /* ss */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET ss,0*/ pushq %rax /* rsp */ -- cgit v1.1 From d75cd22fdd5f7d203fb60014d426942df33dd9a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:26 -0400 Subject: x86/paravirt: split sysret and sysexit Don't conflate sysret and sysexit; they're different instructions with different semantics, and may be in use at the same time (at least within the same kernel, depending on whether its an Intel or AMD system). sysexit - just return to userspace, does no register restoration of any kind; must explicitly atomically enable interrupts. sysret - reloads flags from r11, so no need to explicitly enable interrupts on 64-bit, responsible for restoring usermode %gs Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/paravirt.c | 12 +++++++++--- arch/x86/kernel/paravirt_patch_32.c | 4 ++-- arch/x86/kernel/paravirt_patch_64.c | 4 ++-- arch/x86/kernel/vmi_32.c | 4 ++-- arch/x86/xen/enlighten.c | 2 +- include/asm-x86/irqflags.h | 4 ++-- include/asm-x86/paravirt.h | 15 ++++++++++----- 11 files changed, 36 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 9258808..6649d09 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -111,7 +111,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f126c05..27ac2de 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -62,7 +62,7 @@ int main(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); + OFFSET(PV_CPU_usersp_sysret, pv_cpu_ops, usersp_sysret); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 159a1c7..53393c3 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -58,7 +58,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). * * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -349,7 +349,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSCALL_RET + ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -874,10 +874,10 @@ ENTRY(native_iret) .previous END(native_iret) -ENTRY(native_irq_enable_syscall_ret) +ENTRY(native_irq_enable_sysexit) sti sysexit -END(native_irq_enable_syscall_ret) +END(native_irq_enable_sysexit) #endif KPROBE_ENTRY(int3) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6d11014..0056bc4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -59,7 +59,7 @@ #endif #ifdef CONFIG_PARAVIRT -ENTRY(native_irq_enable_syscall_ret) +ENTRY(native_usersp_sysret) movq %gs:pda_oldrsp,%rsp swapgs sysretq @@ -275,7 +275,7 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ - ENABLE_INTERRUPTS_SYSCALL_RET + USERSP_SYSRET CFI_RESTORE_STATE /* Handle reschedules */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 78c9a1b..565ee7a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -140,7 +140,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || + type == PARAVIRT_PATCH(pv_cpu_ops.usersp_sysret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -191,7 +192,8 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_syscall_ret(void); +extern void native_irq_enable_sysexit(void); +extern void native_usersp_sysret(void); static int __init print_banner(void) { @@ -327,7 +329,11 @@ struct pv_cpu_ops pv_cpu_ops = { .write_idt_entry = native_write_idt_entry, .load_sp0 = native_load_sp0, - .irq_enable_syscall_ret = native_irq_enable_syscall_ret, +#ifdef CONFIG_X86_32 + .irq_enable_sysexit = native_irq_enable_sysexit, +#else + .usersp_sysret = native_usersp_sysret, +#endif .iret = native_iret, .swapgs = native_swapgs, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 82fc5fc..5826221 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -5,7 +5,7 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); @@ -29,7 +29,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 7d904e1..4a17055 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -15,7 +15,7 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); /* the three commands give us more control to how to return from a syscall */ -DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, usersp_sysret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, @@ -35,7 +35,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_cpu_ops, usersp_sysret); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 956f389..946bf13 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -151,7 +151,7 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.iret): return patch_internal(VMI_CALL_IRET, len, insns, ip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); default: break; @@ -896,7 +896,7 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; + pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d62f14e..119c88f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1089,7 +1089,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = xen_sysexit, + .irq_enable_sysexit = xen_sysexit, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index c242527..99ee525 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -112,13 +112,13 @@ static inline unsigned long __raw_local_irq_save(void) #ifdef CONFIG_X86_64 #define INTERRUPT_RETURN iretq -#define ENABLE_INTERRUPTS_SYSCALL_RET \ +#define USERSP_SYSRET \ movq %gs:pda_oldrsp, %rsp; \ swapgs; \ sysretq; #else #define INTERRUPT_RETURN iret -#define ENABLE_INTERRUPTS_SYSCALL_RET sti; sysexit +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define GET_CR0_INTO_EAX movl %cr0, %eax #endif diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 82cdcde..2668903 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -141,8 +141,9 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); - /* These two are jmp to, not actually called. */ - void (*irq_enable_syscall_ret)(void); + /* These three are jmp to, not actually called. */ + void (*irq_enable_sysexit)(void); + void (*usersp_sysret)(void); void (*iret)(void); void (*swapgs)(void); @@ -1480,10 +1481,10 @@ static inline unsigned long __raw_local_irq_save(void) call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS;) -#define ENABLE_INTERRUPTS_SYSCALL_RET \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_syscall_ret),\ +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_syscall_ret)) + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) #ifdef CONFIG_X86_32 @@ -1504,6 +1505,10 @@ static inline unsigned long __raw_local_irq_save(void) movq %rax, %rcx; \ xorq %rax, %rax; +#define USERSP_SYSRET \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usersp_sysret), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usersp_sysret)) #endif #endif /* __ASSEMBLY__ */ -- cgit v1.1 From c7245da6ae7e5208504ff027c4e0eec69b788651 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:27 -0400 Subject: x86/paravirt, 64-bit: don't restore user rsp within sysret There's no need to combine restoring the user rsp within the sysret pvop, so split it out. This makes the pvop's semantics closer to the machine instruction. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/entry_64.S | 6 +++--- arch/x86/kernel/paravirt.c | 6 +++--- arch/x86/kernel/paravirt_patch_64.c | 4 ++-- include/asm-x86/irqflags.h | 3 +-- include/asm-x86/paravirt.h | 8 ++++---- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 27ac2de..a19aba8 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -62,7 +62,7 @@ int main(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_usersp_sysret, pv_cpu_ops, usersp_sysret); + OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0056bc4..18447a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -59,8 +59,7 @@ #endif #ifdef CONFIG_PARAVIRT -ENTRY(native_usersp_sysret) - movq %gs:pda_oldrsp,%rsp +ENTRY(native_usergs_sysret) swapgs sysretq #endif /* CONFIG_PARAVIRT */ @@ -275,7 +274,8 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ - USERSP_SYSRET + movq %gs:pda_oldrsp, %rsp + USERGS_SYSRET CFI_RESTORE_STATE /* Handle reschedules */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 565ee7a..b0b17f0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -141,7 +141,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || - type == PARAVIRT_PATCH(pv_cpu_ops.usersp_sysret)) + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -193,7 +193,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); -extern void native_usersp_sysret(void); +extern void native_usergs_sysret(void); static int __init print_banner(void) { @@ -332,7 +332,7 @@ struct pv_cpu_ops pv_cpu_ops = { #ifdef CONFIG_X86_32 .irq_enable_sysexit = native_irq_enable_sysexit, #else - .usersp_sysret = native_usersp_sysret, + .usergs_sysret = native_usergs_sysret, #endif .iret = native_iret, .swapgs = native_swapgs, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 4a17055..d4c0712 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -15,7 +15,7 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); /* the three commands give us more control to how to return from a syscall */ -DEF_NATIVE(pv_cpu_ops, usersp_sysret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, @@ -35,7 +35,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, usersp_sysret); + PATCH_SITE(pv_cpu_ops, usergs_sysret); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index 99ee525..544836c 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -112,8 +112,7 @@ static inline unsigned long __raw_local_irq_save(void) #ifdef CONFIG_X86_64 #define INTERRUPT_RETURN iretq -#define USERSP_SYSRET \ - movq %gs:pda_oldrsp, %rsp; \ +#define USERGS_SYSRET \ swapgs; \ sysretq; #else diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 2668903..dad5b41 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -143,7 +143,7 @@ struct pv_cpu_ops { /* These three are jmp to, not actually called. */ void (*irq_enable_sysexit)(void); - void (*usersp_sysret)(void); + void (*usergs_sysret)(void); void (*iret)(void); void (*swapgs)(void); @@ -1505,10 +1505,10 @@ static inline unsigned long __raw_local_irq_save(void) movq %rax, %rcx; \ xorq %rax, %rax; -#define USERSP_SYSRET \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usersp_sysret), \ +#define USERGS_SYSRET \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usersp_sysret)) + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret)) #endif #endif /* __ASSEMBLY__ */ -- cgit v1.1 From 2be29982a08009c731307f4a39053b70ac4700da Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:28 -0400 Subject: x86/paravirt: add sysret/sysexit pvops for returning to 32-bit compatibility userspace In a 64-bit system, we need separate sysret/sysexit operations to return to a 32-bit userspace. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/ia32/ia32entry.S | 21 ++++++++++---- arch/x86/kernel/asm-offsets_64.c | 4 ++- arch/x86/kernel/entry_64.S | 4 +-- arch/x86/kernel/paravirt.c | 12 ++++---- arch/x86/kernel/paravirt_patch_64.c | 9 ++++-- include/asm-x86/irqflags.h | 14 ++++++++-- include/asm-x86/paravirt.h | 56 +++++++++++++++++++++++++++++-------- 7 files changed, 89 insertions(+), 31 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3aefbce..2a4c424 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -61,6 +61,19 @@ CFI_UNDEFINED r15 .endm +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) + +ENTRY(native_irq_enable_sysexit) + swapgs + sti + sysexit +ENDPROC(native_irq_enable_sysexit) +#endif + /* * 32bit SYSENTER instruction entry. * @@ -151,10 +164,7 @@ sysenter_do_call: CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rsp,rcx TRACE_IRQS_ON - swapgs - sti /* sti only takes effect after the next instruction */ - /* sysexit */ - .byte 0xf, 0x35 + ENABLE_INTERRUPTS_SYSEXIT32 sysenter_tracesys: CFI_RESTORE_STATE @@ -254,8 +264,7 @@ cstar_do_call: TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp - swapgs - sysretl + USERGS_SYSRET32 cstar_tracesys: CFI_RESTORE_STATE diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index a19aba8..06c451a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -62,7 +62,9 @@ int main(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret); + OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); + OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 18447a3..880ffe5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -59,7 +59,7 @@ #endif #ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret) +ENTRY(native_usergs_sysret64) swapgs sysretq #endif /* CONFIG_PARAVIRT */ @@ -275,7 +275,7 @@ sysret_check: RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ movq %gs:pda_oldrsp, %rsp - USERGS_SYSRET + USERGS_SYSRET64 CFI_RESTORE_STATE /* Handle reschedules */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index b0b17f0..bf1067e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -141,7 +141,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || - type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret)) + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -193,7 +194,8 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); -extern void native_usergs_sysret(void); +extern void native_usergs_sysret32(void); +extern void native_usergs_sysret64(void); static int __init print_banner(void) { @@ -329,10 +331,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_idt_entry = native_write_idt_entry, .load_sp0 = native_load_sp0, -#ifdef CONFIG_X86_32 .irq_enable_sysexit = native_irq_enable_sysexit, -#else - .usergs_sysret = native_usergs_sysret, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = native_usergs_sysret32, + .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, .swapgs = native_swapgs, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index d4c0712..061d01d 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); -/* the three commands give us more control to how to return from a syscall */ -DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); +DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); +DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, @@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, usergs_sysret); + PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); + PATCH_SITE(pv_cpu_ops, usergs_sysret32); + PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index 544836c..ea9bd26 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -112,9 +112,17 @@ static inline unsigned long __raw_local_irq_save(void) #ifdef CONFIG_X86_64 #define INTERRUPT_RETURN iretq -#define USERGS_SYSRET \ - swapgs; \ - sysretq; +#define USERGS_SYSRET64 \ + swapgs; \ + sysretq; +#define USERGS_SYSRET32 \ + swapgs; \ + sysretl +#define ENABLE_INTERRUPTS_SYSEXIT32 \ + swapgs; \ + sti; \ + sysexit + #else #define INTERRUPT_RETURN iret #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index dad5b41..33f72f8 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -141,9 +141,32 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); - /* These three are jmp to, not actually called. */ + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ void (*irq_enable_sysexit)(void); - void (*usergs_sysret)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ void (*iret)(void); void (*swapgs)(void); @@ -1481,18 +1504,24 @@ static inline unsigned long __raw_local_irq_save(void) call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS;) -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ +#define USERGS_SYSRET32 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) - + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32)) #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx -#else + +#define ENABLE_INTERRUPTS_SYSEXIT \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) + + +#else /* !CONFIG_X86_32 */ #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ PV_SAVE_REGS; \ @@ -1505,11 +1534,16 @@ static inline unsigned long __raw_local_irq_save(void) movq %rax, %rcx; \ xorq %rax, %rax; -#define USERGS_SYSRET \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret), \ +#define USERGS_SYSRET64 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret)) -#endif + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#define ENABLE_INTERRUPTS_SYSEXIT32 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) +#endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT */ -- cgit v1.1 From 6680415481c7bd38967cf7488787f509f17ba307 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:29 -0400 Subject: x86, 64-bit: ia32entry: replace privileged instructions with pvops Replace privileged instructions with the corresponding pvops in ia32entry.S. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/ia32/ia32entry.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2a4c424..9740b6a 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -98,14 +98,14 @@ ENTRY(ia32_sysenter_target) CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp - swapgs + SWAPGS movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp /* * No need to follow this irqs on/off section: the syscall * disabled irqs, here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) movl %ebp,%ebp /* zero extension */ pushq $__USER32_DS CFI_ADJUST_CFA_OFFSET 8 @@ -147,7 +147,7 @@ sysenter_do_call: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call @@ -210,7 +210,7 @@ ENTRY(ia32_cstar_target) CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ - swapgs + SWAPGS movl %esp,%r8d CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp @@ -218,7 +218,7 @@ ENTRY(ia32_cstar_target) * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1,1 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -251,7 +251,7 @@ cstar_do_call: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call @@ -319,12 +319,12 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP - swapgs + SWAPGS /* * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%eax pushq %rax CFI_ADJUST_CFA_OFFSET 8 -- cgit v1.1 From a00394f81f419beb6fb9f7023bd4d15913dc625d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:30 -0400 Subject: x86, 64-bit: swapgs pvop with a user-stack can never be called It's never safe to call a swapgs pvop when the user stack is current - it must be inline replaced. Rather than making a call, the SWAPGS_UNSAFE_STACK pvop always just puts "swapgs" as a placeholder, which must either be replaced inline or trap'n'emulated (somehow). Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/irqflags.h | 2 +- include/asm-x86/paravirt.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index ea9bd26..d17e1f6 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -111,6 +111,7 @@ static inline unsigned long __raw_local_irq_save(void) #define DISABLE_INTERRUPTS(x) cli #ifdef CONFIG_X86_64 +#define SWAPGS_UNSAFE_STACK swapgs #define INTERRUPT_RETURN iretq #define USERGS_SYSRET64 \ swapgs; \ @@ -185,7 +186,6 @@ static inline void trace_hardirqs_fixup(void) * Either way, this is a good way to document that we don't * have a reliable stack. x86_64 only. */ -#define SWAPGS_UNSAFE_STACK swapgs #define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk #define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 33f72f8..3286a0c 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -1522,6 +1522,16 @@ static inline unsigned long __raw_local_irq_save(void) #else /* !CONFIG_X86_32 */ + +/* + * If swapgs is used while the userspace stack is still current, + * there's no way to call a pvop. The PV replacement *must* be + * inlined, or the swapgs instruction must be trapped and emulated. + */ +#define SWAPGS_UNSAFE_STACK \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ + swapgs) + #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ PV_SAVE_REGS; \ -- cgit v1.1 From fab58420ac0007a452b540cfb07923225ea4f48d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:31 -0400 Subject: x86/paravirt, 64-bit: add adjust_exception_frame 64-bit Xen pushes a couple of extra words onto an exception frame. Add a hook to deal with them. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/paravirt.c | 3 +++ arch/x86/xen/enlighten.c | 3 +++ include/asm-x86/paravirt.h | 9 +++++++++ include/asm-x86/processor.h | 2 ++ 6 files changed, 20 insertions(+) diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 06c451a..3295e7c 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -61,6 +61,7 @@ int main(void) OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 880ffe5..70fe13a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -736,6 +736,7 @@ END(spurious_interrupt) */ .macro zeroentry sym INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 /* push error code/oldrax */ CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* push real oldrax to the rdi slot */ @@ -748,6 +749,7 @@ END(spurious_interrupt) .macro errorentry sym XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq %rax CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rax,0 diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bf1067e..b20c369 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -296,6 +296,9 @@ struct pv_irq_ops pv_irq_ops = { .irq_enable = native_irq_enable, .safe_halt = native_safe_halt, .halt = native_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = paravirt_nop, +#endif }; struct pv_cpu_ops pv_cpu_ops = { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 119c88f..3b98083 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1123,6 +1123,9 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { .irq_enable = xen_irq_enable, .safe_halt = xen_safe_halt, .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = paravirt_nop, +#endif }; static const struct pv_apic_ops xen_apic_ops __initdata = { diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 3286a0c..3dc223d 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -189,6 +189,10 @@ struct pv_irq_ops { void (*irq_enable)(void); void (*safe_halt)(void); void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif }; struct pv_apic_ops { @@ -1544,6 +1548,11 @@ static inline unsigned long __raw_local_irq_save(void) movq %rax, %rcx; \ xorq %rax, %rax; +#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ + CLBR_NONE, \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) + #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 12b5ede..df2459f 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -532,6 +532,8 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #define SWAPGS swapgs + +#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ #endif /* CONFIG_PARAVIRT */ /* -- cgit v1.1 From 9f9d489a3e78b49d897734eaaf9dea568dbea66e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Wed, 25 Jun 2008 00:19:32 -0400 Subject: x86/paravirt, 64-bit: make load_gs_index() a paravirt operation Signed-off-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/paravirt.c | 3 +++ include/asm-x86/elf.h | 2 +- include/asm-x86/paravirt.h | 10 ++++++++++ include/asm-x86/system.h | 3 ++- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 70fe13a..07d69f2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -944,7 +944,7 @@ KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ -ENTRY(load_gs_index) +ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 @@ -958,7 +958,7 @@ gs_change: CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC -ENDPROC(load_gs_index) +ENDPROC(native_load_gs_index) .section __ex_table,"a" .align 8 diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index b20c369..27819e3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -329,6 +329,9 @@ struct pv_cpu_ops pv_cpu_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = native_load_gs_index, +#endif .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h index 8f232dc..7be4733 100644 --- a/include/asm-x86/elf.h +++ b/include/asm-x86/elf.h @@ -83,9 +83,9 @@ extern unsigned int vdso_enabled; (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include <asm/processor.h> +#include <asm/system.h> #ifdef CONFIG_X86_32 -#include <asm/system.h> /* for savesegment */ #include <asm/desc.h> #define elf_check_arch(x) elf_check_arch_ia32(x) diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 3dc223d..6d8966f 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -115,6 +115,9 @@ struct pv_cpu_ops { void (*set_ldt)(const void *desc, unsigned entries); unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, const void *desc); void (*write_gdt_entry)(struct desc_struct *, @@ -845,6 +848,13 @@ static inline void load_TLS(struct thread_struct *t, unsigned cpu) PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu); } +#ifdef CONFIG_X86_64 +static inline void load_gs_index(unsigned int gs) +{ + PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs); +} +#endif + static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index f686aa6..c4946c5 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -136,7 +136,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \ #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) -extern void load_gs_index(unsigned); +extern void native_load_gs_index(unsigned); /* * Load a segment. Fall back on loading the zero @@ -282,6 +282,7 @@ static inline void native_wbinvd(void) #ifdef CONFIG_X86_64 #define read_cr8() (native_read_cr8()) #define write_cr8(x) (native_write_cr8(x)) +#define load_gs_index native_load_gs_index #endif /* Clear the 'TS' bit */ -- cgit v1.1 From 8207c2570af6f819b61be9ef3fb298d0a8c0e18c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 24 Jun 2008 17:32:48 -0400 Subject: x86: fix pte allocation in "x86: introduce init_memory_mapping for 32bit" The patch "x86: introduce init_memory_mapping for 32bit" does not allocate enough space for PTEs if the CPU does not implement PSE. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Acked-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c059c46..156000d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -733,6 +733,11 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + if (!cpu_has_pse) { + int ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + } + /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables -- cgit v1.1 From 611dfd7819e525b45f39ff15e0faf5f23551c113 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Wed, 25 Jun 2008 21:39:16 +0200 Subject: x86: limit E820 map when a user-defined memory map is specified This patch brings back limiting of the E820 map when a user-defined E820 map is specified. While the behaviour of i386 (32 bit) was to limit the E820 map (and /proc/iomem), the behaviour of x86-64 (64 bit) was not to limit. That patch limits the E820 map again for both x86 architectures. Code was tested for compilation and booting on a 32 bit and 64 bit system. Signed-off-by: Bernhard Walle <bwalle@suse.de> Acked-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: kexec@lists.infradead.org Cc: vgoyal@redhat.com Cc: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 1dcb665..7b7685b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1117,6 +1117,9 @@ static int __init parse_memopt(char *p) mem_size = memparse(p, &p); end_user_pfn = mem_size>>PAGE_SHIFT; + e820_update_range(mem_size, ULLONG_MAX - mem_size, + E820_RAM, E820_RESERVED); + return 0; } early_param("mem", parse_memopt); @@ -1161,6 +1164,8 @@ static int __init parse_memmap_opt(char *p) e820_add_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); + e820_update_range(mem_size, ULLONG_MAX - mem_size, + E820_RAM, E820_RESERVED); } return *p == '\0' ? 0 : -EINVAL; } -- cgit v1.1 From 042623bbabae168246ad8a37693f0ecb6c450aea Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 19:52:15 -0700 Subject: x86: clean up ARCH_SETUP asm-x86/paravirt.h already have protection with CONFIG_PARAVIRT inside Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 8 ++++---- include/asm-x86/mach-default/setup_arch.h | 4 ---- include/asm-x86/mach-visws/setup_arch.h | 2 -- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63dbb5e..161609c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,11 +101,7 @@ #include <asm/proto.h> #include <mach_apic.h> -#ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> -#else -#define ARCH_SETUP -#endif #include <asm/percpu.h> #include <asm/sections.h> @@ -115,6 +111,10 @@ #include <asm/numa_64.h> #endif +#ifndef ARCH_SETUP +#define ARCH_SETUP +#endif + #ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; #else diff --git a/include/asm-x86/mach-default/setup_arch.h b/include/asm-x86/mach-default/setup_arch.h index 605e3cc..3884620 100644 --- a/include/asm-x86/mach-default/setup_arch.h +++ b/include/asm-x86/mach-default/setup_arch.h @@ -1,7 +1,3 @@ /* Hook to call BIOS initialisation function */ /* no action for generic */ - -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif diff --git a/include/asm-x86/mach-visws/setup_arch.h b/include/asm-x86/mach-visws/setup_arch.h index 33f700e..b8f5dd8 100644 --- a/include/asm-x86/mach-visws/setup_arch.h +++ b/include/asm-x86/mach-visws/setup_arch.h @@ -4,5 +4,3 @@ extern unsigned long sgivwfb_mem_phys; extern unsigned long sgivwfb_mem_size; /* no action for visws */ - -#define ARCH_SETUP -- cgit v1.1 From e7b3789524eecc96213dd69d6686efd429235051 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 25 Jun 2008 21:51:28 -0700 Subject: x86: move fix mapping page table range early do that in init_memory_mapping also remove one init_ohci1394_dma_on_all_controllers Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 23 +++++++++-------------- arch/x86/mm/init_32.c | 15 +++++++++++---- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 161609c..bf528b2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -611,11 +611,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 probe_roms(); -#else -# ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - if (init_ohci1394_dma_early) - init_ohci1394_dma_on_all_controllers(); -# endif #endif /* after parse_early_param, so could debug it */ @@ -672,6 +667,15 @@ void __init setup_arch(char **cmdline_p) /* max_pfn_mapped is updated here */ max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT)); + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + reserve_initrd(); #ifdef CONFIG_X86_64 @@ -739,15 +743,6 @@ void __init setup_arch(char **cmdline_p) map_vsyscall(); #endif - /* - * NOTE: On x86-32, only from this point on, fixmaps are ready for use. - */ - -#if defined(CONFIG_PROVIDE_OHCI1394_DMA_INIT) && defined(CONFIG_X86_32) - if (init_ohci1394_dma_early) - init_ohci1394_dma_on_all_controllers(); -#endif - #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 156000d..b9cf7f7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -442,13 +442,10 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init(void) +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) { - pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; - paravirt_pagetable_setup_start(pgd_base); - /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -458,6 +455,13 @@ static void __init pagetable_init(void) end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); early_ioremap_reset(); +} + +static void __init pagetable_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + + paravirt_pagetable_setup_start(pgd_base); permanent_kmaps_init(pgd_base); @@ -788,6 +792,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, kernel_physical_mapping_init(pgd_base, start, end); + early_ioremap_page_table_range_init(pgd_base); + load_cr3(swapper_pg_dir); __flush_tlb_all(); @@ -799,6 +805,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return end >> PAGE_SHIFT; } + /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. -- cgit v1.1 From 457da70ec09ca78e8fda5d5ab3659248f844a376 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Thu, 26 Jun 2008 07:28:51 -0700 Subject: x86/paravirt: groundwork for 64-bit Xen support, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingo Molnar wrote: > * Jeremy Fitzhardinge <jeremy@goop.org> wrote: > > >>> It quickly broke the build in testing: >>> >>> include/asm/pgalloc.h: In function ‘paravirt_pgd_free': >>> include/asm/pgalloc.h:14: error: parameter name omitted >>> arch/x86/kernel/entry_64.S: In file included from >>> arch/x86/kernel/traps_64.c:51:include/asm/pgalloc.h: In function >>> ‘paravirt_pgd_free': >>> include/asm/pgalloc.h:14: error: parameter name omitted >>> >>> >> No, looks like my fault. The non-PARAVIRT version of >> paravirt_pgd_free() is: >> >> static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *) {} >> >> but C doesn't like missing parameter names, even if unused. >> >> This should fix it: >> > > that fixed the build but now we've got a boot crash with this config: > > time.c: Detected 2010.304 MHz processor. > spurious 8259A interrupt: IRQ7. > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [<0000000000000000>] > PGD 0 > Thread overran stack, or stack corrupted > Oops: 0010 [1] SMP > CPU 0 > > with: > > http://redhat.com/~mingo/misc/config-Thu_Jun_26_12_46_46_CEST_2008.bad > Use SWAPGS_UNSAFE_STACK in ia32entry.S in the places where the active stack is the usermode stack. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/ia32/ia32entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 9740b6a..24e4d49 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -98,7 +98,7 @@ ENTRY(ia32_sysenter_target) CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp - SWAPGS + SWAPGS_UNSAFE_STACK movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp /* @@ -210,7 +210,7 @@ ENTRY(ia32_cstar_target) CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ - SWAPGS + SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp -- cgit v1.1 From 22b45144f67dbaf0705992dc1462de2813fb83a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Thu, 26 Jun 2008 12:02:49 -0700 Subject: x86/paravirt: groundwork for 64-bit Xen support, fix #2 Ingo Molnar wrote: > that fixed the build but now we've got a boot crash with this config: > > time.c: Detected 2010.304 MHz processor. > spurious 8259A interrupt: IRQ7. > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [<0000000000000000>] > PGD 0 > Thread overran stack, or stack corrupted > Oops: 0010 [1] SMP > CPU 0 > I don't know if this will fix this bug, but it's definitely a bugfix. It was trashing random pages by overwriting them with pagetables... Don't trash a large pmd's data when mapping physical memory. This is a bugfix for "x86_64: adjust mapping of physical pagetables to work with Xen". Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5c3305e..b10b7f1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -320,7 +320,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) } if (pmd_val(*pmd)) { - phys_pte_update(pmd, address, end); + if (!pmd_large(*pmd)) + phys_pte_update(pmd, address, end); continue; } -- cgit v1.1 From ab67715c7201be2fe729888a09007b6ba5bb2326 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 27 Jun 2008 15:36:54 -0700 Subject: x86: early res print out alignment v2 v2: fix print info to cont Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7b7685b..fa77cb4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -828,16 +828,26 @@ void __init free_early(u64 start, u64 end) void __init early_res_to_bootmem(u64 start, u64 end) { - int i; + int i, count; u64 final_start, final_end; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + + count = 0; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) + count++; + + printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); + for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [ %010llx - %010llx ] %16s", i, + r->start, r->end, r->name); final_start = max(start, r->start); final_end = min(end, r->end); - if (final_start >= final_end) + if (final_start >= final_end) { + printk(KERN_CONT "\n"); continue; - printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, - final_start, final_end - 1, r->name); + } + printk(KERN_CONT " ===> [ %010llx - %010llx ]\n", + final_start, final_end); reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } -- cgit v1.1 From f3294a33e765d8308c3e17b951a13e0db9cf5f00 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Fri, 27 Jun 2008 01:41:56 -0700 Subject: x86: let setup_arch call init_apic_mappings for 32bit instead of calling it from trap_init() also move init ioapic mapping out of apic_32.c so 32 bit do same as 64 bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 30 ------------------------------ arch/x86/kernel/io_apic_32.c | 32 ++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 6 ++---- arch/x86/kernel/traps_32.c | 4 ---- include/asm-x86/apic.h | 1 + include/asm-x86/io_apic.h | 1 + 6 files changed, 36 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index b5571fb..8fbad8e 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1197,36 +1197,6 @@ void __init init_apic_mappings(void) if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); -#ifdef CONFIG_X86_IO_APIC - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } - } else { -fake_ioapic_page: - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } - } -#endif } /* diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index d6af301..337ec34 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> +#include <linux/bootmem.h> #include <linux/mc146818rtc.h> #include <linux/compiler.h> #include <linux/acpi.h> @@ -2852,3 +2853,34 @@ static int __init parse_noapic(char *arg) return 0; } early_param("noapic", parse_noapic); + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mp_apicaddr; + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } + } else { +fake_ioapic_page: + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } +} + diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf528b2..4716460 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -766,16 +766,14 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif -#ifdef CONFIG_X86_64 init_apic_mappings(); ioapic_init_mappings(); -#else -# if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) if (def_to_bigsmp) printk(KERN_WARNING "More than 8 CPUs detected and " "CONFIG_X86_PC cannot handle it.\nUse " "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -# endif #endif kvm_guest_init(); diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index dc7c05e..f60feee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1198,10 +1198,6 @@ void __init trap_init(void) early_iounmap(p, 4); #endif -#ifdef CONFIG_X86_LOCAL_APIC - init_apic_mappings(); -#endif - set_trap_gate(0, ÷_error); set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 9fe941c..6ae6ae6 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -135,6 +135,7 @@ extern int apic_is_clustered_box(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 8b1f568..14f82bb 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -186,6 +186,7 @@ extern void ioapic_init_mappings(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; +static inline void ioapic_init_mappings(void) { } #endif #endif -- cgit v1.1 From df366e9822beca97115ba9745cbe1ea1f26fb111 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Fri, 27 Jun 2008 12:04:03 -0700 Subject: x86_64: fix non-paravirt compilation Make sure SWAPGS and PARAVIRT_ADJUST_EXCEPTION_FRAME are properly defined when CONFIG_PARAVIRT is off. Fixes Ingo's build failure: arch/x86/kernel/entry_64.S: Assembler messages: arch/x86/kernel/entry_64.S:1201: Error: invalid character '_' in mnemonic arch/x86/kernel/entry_64.S:1205: Error: invalid character '_' in mnemonic arch/x86/kernel/entry_64.S:1209: Error: invalid character '_' in mnemonic arch/x86/kernel/entry_64.S:1213: Error: invalid character '_' in mnemonic Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mark McLoughlin <markmc@redhat.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Stephen Tweedie <sct@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/irqflags.h | 22 +++++++++++++--------- include/asm-x86/processor.h | 3 --- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index d17e1f6..17e7a17 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -111,7 +111,20 @@ static inline unsigned long __raw_local_irq_save(void) #define DISABLE_INTERRUPTS(x) cli #ifdef CONFIG_X86_64 +#define SWAPGS swapgs +/* + * Currently paravirt can't handle swapgs nicely when we + * don't have a stack we can rely on (such as a user space + * stack). So we either find a way around these or just fault + * and emulate if a guest tries to call swapgs directly. + * + * Either way, this is a good way to document that we don't + * have a reliable stack. x86_64 only. + */ #define SWAPGS_UNSAFE_STACK swapgs + +#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ + #define INTERRUPT_RETURN iretq #define USERGS_SYSRET64 \ swapgs; \ @@ -177,15 +190,6 @@ static inline void trace_hardirqs_fixup(void) #else #ifdef CONFIG_X86_64 -/* - * Currently paravirt can't handle swapgs nicely when we - * don't have a stack we can rely on (such as a user space - * stack). So we either find a way around these or just fault - * and emulate if a guest tries to call swapgs directly. - * - * Either way, this is a good way to document that we don't - * have a reliable stack. x86_64 only. - */ #define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk #define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index df2459f..7f73827 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -531,9 +531,6 @@ static inline void load_sp0(struct tss_struct *tss, } #define set_iopl_mask native_set_iopl_mask -#define SWAPGS swapgs - -#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */ #endif /* CONFIG_PARAVIRT */ /* -- cgit v1.1 From 7482b0e962e128c5b574aa29761f97164189ef14 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 28 Jun 2008 03:30:39 -0700 Subject: x86: fix init_memory_mapping over boundary v3 some ram-end boundary only has page alignment, instead of 2M alignment. v2: make init_memory_mapping more solid: start could be any value other than 0 v3: fix NON PAE by handling left over in kernel_physical_mapping Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b9cf7f7..90ca67b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -195,7 +195,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, unsigned pages_2m = 0, pages_4k = 0; unsigned limit_pfn = end >> PAGE_SHIFT; - pgd_idx = pgd_index(PAGE_OFFSET); + pgd_idx = pgd_index(start + PAGE_OFFSET); pgd = pgd_base + pgd_idx; pfn = start >> PAGE_SHIFT; @@ -218,7 +218,8 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, * and overlapping MTRRs into large pages can cause * slowdowns. */ - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { + if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0) && + (pfn + PTRS_PER_PTE) <= limit_pfn) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -233,13 +234,12 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - max_pfn_mapped = pfn; continue; } pte = one_page_table_init(pmd); for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte_ofs < PTRS_PER_PTE && pfn < limit_pfn; pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { pgprot_t prot = PAGE_KERNEL; @@ -249,7 +249,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pages_4k++; set_pte(pte, pfn_pte(pfn, prot)); } - max_pfn_mapped = pfn; } } update_page_count(PG_LEVEL_2M, pages_2m); @@ -729,7 +728,7 @@ void __init setup_bootmem_allocator(void) static void __init find_early_table_space(unsigned long end) { - unsigned long puds, pmds, tables, start; + unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = PAGE_ALIGN(puds * sizeof(pud_t)); @@ -737,10 +736,15 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (!cpu_has_pse) { - int ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); - } + if (cpu_has_pse) { + unsigned long extra; + extra = end - ((end>>21) << 21); + extra += (2UL<<20); + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += PAGE_ALIGN(ptes * sizeof(pte_t)); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.1 From b4df32f4aeef8794d0135fc8dc250acb44cfee60 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sat, 28 Jun 2008 17:49:59 -0700 Subject: x86: fix warning in e820_reserve_resources with 32bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when 64bit resource is not enabled, we get: arch/x86/kernel/e820.c: In function ‘e820_reserve_resources’: arch/x86/kernel/e820.c:1217: warning: comparison is always false due to limited range of data type because res->start/end is resource_t aka u32. it will overflow. fix it with temp end of u64 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index fa77cb4..ba5ac88 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1202,6 +1202,7 @@ void __init e820_reserve_resources(void) { int i; struct resource *res; + u64 end; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { @@ -1211,14 +1212,16 @@ void __init e820_reserve_resources(void) case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; default: res->name = "reserved"; } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; + end = e820.map[i].addr + e820.map[i].size - 1; #ifndef CONFIG_RESOURCES_64BIT - if (res->end > 0x100000000ULL) { + if (end > 0x100000000ULL) { res++; continue; } #endif + res->start = e820.map[i].addr; + res->end = end; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); res++; -- cgit v1.1 From a04ad82d0bff4bb564f290eb50982e02458592d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 29 Jun 2008 00:39:06 -0700 Subject: x86: fix init_memory_mapping over boundary, v4 use PMD_SHIFT to calculate boundary also adjust size for pre-allocated table size Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 89 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 90ca67b..aa5e37c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -184,8 +184,9 @@ static inline int is_kernel_text(unsigned long addr) * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start, - unsigned long end) + unsigned long start_pfn, + unsigned long end_pfn, + int use_pse) { int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; @@ -193,33 +194,33 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pmd_t *pmd; pte_t *pte; unsigned pages_2m = 0, pages_4k = 0; - unsigned limit_pfn = end >> PAGE_SHIFT; - pgd_idx = pgd_index(start + PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = start >> PAGE_SHIFT; + if (!cpu_has_pse) + use_pse = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pgd = pgd_base + pgd_idx; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); - if (pfn >= limit_pfn) - continue; - for (pmd_idx = 0; - pmd_idx < PTRS_PER_PMD && pfn < limit_pfn; + if (pfn >= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pmd += pmd_idx; +#else + pmd_idx = 0; +#endif + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; pmd++, pmd_idx++) { unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; /* * Map with big pages if possible, otherwise * create normal page tables: - * - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. */ - if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0) && - (pfn + PTRS_PER_PTE) <= limit_pfn) { + if (use_pse) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; @@ -238,8 +239,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, } pte = one_page_table_init(pmd); - for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < limit_pfn; + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { pgprot_t prot = PAGE_KERNEL; @@ -738,14 +740,18 @@ static void __init find_early_table_space(unsigned long end) if (cpu_has_pse) { unsigned long extra; - extra = end - ((end>>21) << 21); - extra += (2UL<<20); + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + extra += PMD_SIZE; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + /* for fixmap */ + tables += PAGE_SIZE * 2; + /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables @@ -770,6 +776,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { pgd_t *pgd_base = swapper_pg_dir; + unsigned long start_pfn, end_pfn; + unsigned long big_page_start; /* * Find space for the kernel direct mapping tables. @@ -794,7 +802,44 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } - kernel_physical_mapping_init(pgd_base, start, end); + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + big_page_start = PMD_SIZE; + + if (start < big_page_start) { + start_pfn = start >> PAGE_SHIFT; + end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); + } else { + /* head is not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + } + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); + + /* big page range */ + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < (big_page_start >> PAGE_SHIFT)) + start_pfn = big_page_start >> PAGE_SHIFT; + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, + cpu_has_pse); + + /* tail is not big page alignment ? */ + start_pfn = end_pfn; + if (start_pfn > (big_page_start>>PAGE_SHIFT)) { + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) + kernel_physical_mapping_init(pgd_base, start_pfn, + end_pfn, 0); + } early_ioremap_page_table_range_init(pgd_base); -- cgit v1.1 From 914bebfad42c417b84bda8920a3073d236007fde Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 29 Jun 2008 00:06:37 -0700 Subject: x86: use disable_apic in 32bit change the enable_local_apic to static force_enable_local_apic for 32bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/apic_32.c | 15 ++++++++------- arch/x86/kernel/setup.c | 4 ---- include/asm-x86/apic.h | 4 ---- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 8fbad8e..6dea830 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -55,9 +55,10 @@ unsigned long mp_lapic_addr; /* * Knob to control our willingness to enable the local APIC. * - * -1=force-disable, +1=force-enable + * +1=force-enable */ -int enable_local_apic; +static int force_enable_local_apic; +int disable_apic; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; @@ -1099,7 +1100,7 @@ static int __init detect_init_APIC(void) u32 h, l, features; /* Disabled by kernel option? */ - if (enable_local_apic < 0) + if (disable_apic) return -1; switch (boot_cpu_data.x86_vendor) { @@ -1122,7 +1123,7 @@ static int __init detect_init_APIC(void) * Over-ride BIOS and try to enable the local APIC only if * "lapic" specified. */ - if (enable_local_apic <= 0) { + if (!force_enable_local_apic) { printk(KERN_INFO "Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; @@ -1208,7 +1209,7 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (enable_local_apic < 0) + if (disable_apic) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); if (!smp_found_config && !cpu_has_apic) @@ -1682,14 +1683,14 @@ static void apic_pm_activate(void) { } */ static int __init parse_lapic(char *arg) { - enable_local_apic = 1; + force_enable_local_apic = 1; return 0; } early_param("lapic", parse_lapic); static int __init parse_nolapic(char *arg) { - enable_local_apic = -1; + disable_apic = 1; clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4716460..fb318ed 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -598,12 +598,8 @@ void __init setup_arch(char **cmdline_p) if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC -#ifdef CONFIG_X86_32 - enable_local_apic = -1; -#else disable_apic = 1; #endif -#endif clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); } diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 6ae6ae6..a298077 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -40,11 +40,7 @@ extern int local_apic_timer_c2_ok; extern int ioapic_force; -#ifdef CONFIG_X86_64 extern int disable_apic; -#else -extern int enable_local_apic; -#endif /* * Basic functions accessing APICs. */ -- cgit v1.1 From afda335dc3872ca122842e26720ac6e6ef287aa2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Fri, 27 Jun 2008 19:43:40 +0400 Subject: x86: nmi_watchdog - documentation fix nmi_watchdog is set to NMI_NONE by default (ie disabled) on _any_ mode so lets fix documentation too. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: "Maciej W. Rozycki" <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/nmi_watchdog.txt | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/Documentation/nmi_watchdog.txt b/Documentation/nmi_watchdog.txt index 757c729..101bfcf 100644 --- a/Documentation/nmi_watchdog.txt +++ b/Documentation/nmi_watchdog.txt @@ -10,7 +10,7 @@ us to generate 'watchdog NMI interrupts'. (NMI: Non Maskable Interrupt which get executed even if the system is otherwise locked up hard). This can be used to debug hard kernel lockups. By executing periodic NMI interrupts, the kernel can monitor whether any CPU has locked up, -and print out debugging messages if so. +and print out debugging messages if so. In order to use the NMI watchdog, you need to have APIC support in your kernel. For SMP kernels, APIC support gets compiled in automatically. For @@ -22,8 +22,7 @@ CONFIG_X86_UP_IOAPIC is for uniprocessor with an IO-APIC. [Note: certain kernel debugging options, such as Kernel Stack Meter or Kernel Tracer, may implicitly disable the NMI watchdog.] -For x86-64, the needed APIC is always compiled in, and the NMI watchdog is -always enabled with I/O-APIC mode (nmi_watchdog=1). +For x86-64, the needed APIC is always compiled in. Using local APIC (nmi_watchdog=2) needs the first performance register, so you can't use it for other purposes (such as high precision performance @@ -63,16 +62,15 @@ when the system is idle), but if your system locks up on anything but the "hlt", then you are out of luck -- the event will not happen at all and the watchdog won't trigger. This is a shortcoming of the local APIC watchdog -- unfortunately there is no "clock ticks" event that would work all the -time. The I/O APIC watchdog is driven externally and has no such shortcoming. +time. The I/O APIC watchdog is driven externally and has no such shortcoming. But its NMI frequency is much higher, resulting in a more significant hit to the overall system performance. -NOTE: starting with 2.4.2-ac18 the NMI-oopser is disabled by default, -you have to enable it with a boot time parameter. Prior to 2.4.2-ac18 -the NMI-oopser is enabled unconditionally on x86 SMP boxes. +On x86 nmi_watchdog is disabled by default so you have to enable it with +a boot time parameter. -On x86-64 the NMI oopser is on by default. On 64bit Intel CPUs -it uses IO-APIC by default and on AMD it uses local APIC. +NOTE: Prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally +on x86 SMP boxes. [ feel free to send bug reports, suggestions and patches to Ingo Molnar <mingo@redhat.com> or the Linux SMP mailing -- cgit v1.1 From 1bb3a029078d437aa05bda8a8c8f8ecb1265e231 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 30 Jun 2008 08:47:42 +0200 Subject: x86: nmi_watchdog - documentation fix - v2 typo fixes from Randy Dunlap and Alan Cox. Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/nmi_watchdog.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/nmi_watchdog.txt b/Documentation/nmi_watchdog.txt index 101bfcf..90aa453 100644 --- a/Documentation/nmi_watchdog.txt +++ b/Documentation/nmi_watchdog.txt @@ -69,7 +69,7 @@ to the overall system performance. On x86 nmi_watchdog is disabled by default so you have to enable it with a boot time parameter. -NOTE: Prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally +NOTE: In kernels prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally on x86 SMP boxes. [ feel free to send bug reports, suggestions and patches to -- cgit v1.1 From 1a98fd14f44cfade4af3e6ed96ba55065fa17ee4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Sun, 29 Jun 2008 20:02:44 -0700 Subject: x86: setup_arch() && early_ioremap_init() Looks like the setup.c unification missed the early_ioremap init from the early_ioremap unification. Unconditionally call early_ioremap_init(). needed for "x86/paravirt: groundwork for 64-bit Xen support". Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mark McLoughlin <markmc@redhat.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fb318ed..caec79f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -523,12 +523,13 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); - early_ioremap_init(); reserve_setup_data(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_ioremap_init(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; -- cgit v1.1 From 3ae960a598b9dfe87b29eb70738d91a13e692498 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 30 Jun 2008 10:33:47 +0200 Subject: - x86: move early_ioremap prototypes to io.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit now that the early-ioremap code is unified, move the prototypes too from io_32.h to io.h. this fixes: arch/x86/kernel/setup.c:531: error: implicit declaration of function ‘early_ioremap_init' on 64-bit. Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/io.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/asm-x86/io.h b/include/asm-x86/io.h index c63563d..bf5d629 100644 --- a/include/asm-x86/io.h +++ b/include/asm-x86/io.h @@ -5,6 +5,20 @@ #include <linux/compiler.h> +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. + */ +#ifndef __ASSEMBLY__ +extern void early_ioremap_init(void); +extern void early_ioremap_clear(void); +extern void early_ioremap_reset(void); +extern void *early_ioremap(unsigned long offset, unsigned long size); +extern void early_iounmap(void *addr, unsigned long size); +extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); +#endif + #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":"=" reg (ret) \ -- cgit v1.1 From 102d0a4b56d94e9b7eedfdfb488400271235543f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 30 Jun 2008 11:10:53 -0700 Subject: x86, paravirt, 64-bit: fix compile errors with IA32_EMULATION off Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/paravirt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 27819e3..e7e5652 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -337,9 +337,13 @@ struct pv_cpu_ops pv_cpu_ops = { .write_idt_entry = native_write_idt_entry, .load_sp0 = native_load_sp0, +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) .irq_enable_sysexit = native_irq_enable_sysexit, +#endif #ifdef CONFIG_X86_64 +#ifdef CONFIG_IA32_EMULATION .usergs_sysret32 = native_usergs_sysret32, +#endif .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, -- cgit v1.1 From 28bb22379513ca3cac9d13766064a219c5fc21a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 30 Jun 2008 16:20:54 -0700 Subject: x86: move reserve_setup_data to setup.c Ying Huang would like setup_data to be reserved, but not included in the no save range. Here we try to modify the e820 table to reserve that range early. also add that in early_res in case bootloader messes up with the ramdisk. other solution would be 1. add early_res_to_highmem... 2. early_res_to_e820... but they could reserve another type memory wrongly, if early_res has some resource reserved early, and not needed later, but it is not removed from early_res in time. Like the RAMDISK (already handled). Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: andi@firstfloor.org Tested-by: Huang, Ying <ying.huang@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 4 +++- arch/x86/kernel/head.c | 18 ------------------ arch/x86/kernel/head64.c | 1 - arch/x86/kernel/setup.c | 34 ++++++++++++++++++++++++++++------ include/asm-x86/bootparam.h | 2 -- include/asm-x86/e820.h | 3 +++ 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ba5ac88..e03b89a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -120,6 +120,7 @@ void __init e820_print_map(char *who) (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { case E820_RAM: + case E820_RESERVED_KERN: printk(KERN_CONT "(usable)\n"); break; case E820_RESERVED: @@ -611,7 +612,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) register_nosave_region(pfn, PFN_UP(ei->addr)); pfn = PFN_DOWN(ei->addr + ei->size); - if (ei->type != E820_RAM) + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) register_nosave_region(PFN_UP(ei->addr), pfn); if (pfn >= limit_pfn) @@ -1207,6 +1208,7 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { switch (e820.map[i].type) { + case E820_RESERVED_KERN: case E820_RAM: res->name = "System RAM"; break; case E820_ACPI: res->name = "ACPI Tables"; break; case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index a6816be..3e66bd3 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -53,21 +53,3 @@ void __init reserve_ebda_region(void) /* reserve all memory between lowmem and the 1MB mark */ reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); } - -void __init reserve_setup_data(void) -{ - struct setup_data *data; - u64 pa_data; - char buf[32]; - - if (boot_params.hdr.version < 0x0209) - return; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); - pa_data = data->next; - early_iounmap(data, sizeof(*data)); - } -} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f684e3b..c978198 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -128,7 +128,6 @@ void __init x86_64_start_reservations(char *real_mode_data) #endif reserve_ebda_region(); - reserve_setup_data(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index caec79f..2ca12d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -389,14 +389,34 @@ static void __init parse_setup_data(void) default: break; } -#ifndef CONFIG_DEBUG_BOOT_PARAMS - free_early(pa_data, pa_data+sizeof(*data)+data->len); -#endif pa_data = data->next; early_iounmap(data, PAGE_SIZE); } } +static void __init reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + e820_update_range(pa_data, sizeof(*data)+data->len, + E820_RAM, E820_RESERVED_KERN); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "extended physical RAM map:\n"); + e820_print_map("reserve setup_data"); +} + /* * --------- Crashkernel reservation ------------------------------ */ @@ -523,7 +543,6 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); - reserve_setup_data(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif @@ -567,6 +586,8 @@ void __init setup_arch(char **cmdline_p) ARCH_SETUP setup_memory_map(); + parse_setup_data(); + copy_edd(); if (!boot_params.hdr.root_flags) @@ -593,10 +614,11 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; - parse_setup_data(); - parse_early_param(); + /* after early param, so could get panic from serial */ + reserve_setup_data(); + if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index 6eeba3b..ae22bdf 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -108,6 +108,4 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -void reserve_setup_data(void); - #endif /* _ASM_BOOTPARAM_H */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index f622685..45e904f 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -44,6 +44,9 @@ #define E820_ACPI 3 #define E820_NVS 4 +/* reserved RAM used by kernel itself */ +#define E820_RESERVED_KERN 128 + #ifndef __ASSEMBLY__ struct e820entry { __u64 addr; /* start of memory segment */ -- cgit v1.1 From 996cf4438f9ed711c98a3cb5ab88f842a4102427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Mon, 30 Jun 2008 18:34:58 -0700 Subject: x86: don't reallocate pgt for node0 kva ram already mapped right after away, so don't need to get that for low ram. avoid wasting one copy of pgdat. also add node id in early_res name in case we get it from find_e820_area. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/discontig_32.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 42484d4..e2f0dd1 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,17 +156,20 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, - (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT, + max_pfn_mapped<<PAGE_SHIFT, sizeof(pg_data_t), PAGE_SIZE); NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), - "NODE_DATA"); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); -- cgit v1.1 From cd5dce2fb023a6f0168344b7dd8adec30017458e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Mon, 30 Jun 2008 16:04:48 -0700 Subject: x86: fix CPA self-test for "x86/paravirt: groundwork for 64-bit Xen support" Ingo Molnar wrote: > -tip auto-testing found pagetable corruption (CPA self-test failure): > > [ 32.956015] CPA self-test: > [ 32.958822] 4k 2048 large 508 gb 0 x 2556[ffff880000000000-ffff88003fe00000] miss 0 > [ 32.964000] CPA ffff88001d54e000: bad pte 1d4000e3 > [ 32.968000] CPA ffff88001d54e000: unexpected level 2 > [ 32.972000] CPA ffff880022c5d000: bad pte 22c000e3 > [ 32.976000] CPA ffff880022c5d000: unexpected level 2 > [ 32.980000] CPA ffff8800200ce000: bad pte 200000e3 > [ 32.984000] CPA ffff8800200ce000: unexpected level 2 > [ 32.988000] CPA ffff8800210f0000: bad pte 210000e3 > > config and full log can be found at: > > http://redhat.com/~mingo/misc/config-Mon_Jun_30_11_11_51_CEST_2008.bad > http://redhat.com/~mingo/misc/log-Mon_Jun_30_11_11_51_CEST_2008.bad Phew. OK, I've worked this out. Short version is that's it's a false alarm, and there was no real failure here. Long version: * I changed the code to create the physical mapping pagetables to reuse any existing mapping rather than replace it. Specifically, reusing an pud pointed to by the pgd caused this symptom to appear. * The specific PUD being reused is the one created statically in head_64.S, which creates an initial 1GB mapping. * That mapping doesn't have _PAGE_GLOBAL set on it, due to the inconsistency between __PAGE_* and PAGE_*. * The CPA test attempts to clear _PAGE_GLOBAL, and then checks to see that the resulting range is 1) shattered into 4k pages, and 2) has no _PAGE_GLOBAL. * However, since it didn't have _PAGE_GLOBAL on that range to start with, change_page_attr_clear() had nothing to do, and didn't bother shattering the range, * resulting in the reported messages The simple fix is to set _PAGE_GLOBAL in level2_ident_pgt. An additional fix to make CPA testing more robust by using some other pagetable bit (one of the unused available-to-software ones). This would solve spurious CPA test warnings under Xen which uses _PAGE_GLOBAL for its own purposes (ie, not under guest control). Also, we should revisit the use of _PAGE_GLOBAL in asm-x86/pgtable.h, and use it consistently, and drop MAKE_GLOBAL. The first time I proposed it it caused breakages in the very early CPA code; with luck that's all fixed now. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mark McLoughlin <markmc@redhat.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 38279fa..4336f98 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* -- cgit v1.1 From 6a2f47ca27fad36f99e8478a3807d4b8c7db80e7 Mon Sep 17 00:00:00 2001 From: Mike Travis <travis@sgi.com> Date: Fri, 27 Jun 2008 10:10:13 -0700 Subject: x86: add check for node passed to node_to_cpumask, v3 * When CONFIG_DEBUG_PER_CPU_MAPS is set, the node passed to node_to_cpumask and node_to_cpumask_ptr should be validated. If invalid, then a dump_stack is performed and a zero cpumask is returned. v2: Slightly different version to remove a compiler warning. v3: Redone to reflect moving setup.c -> setup_percpu.c Signed-off-by: Mike Travis <travis@sgi.com> Cc: Vegard Nossum <vegard.nossum@gmail.com> Cc: "akpm@linux-foundation.org" <akpm@linux-foundation.org> Cc: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup_percpu.c | 26 +++++++++++++++++++++++--- include/asm-x86/topology.h | 7 ++++++- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7068f95..43aca2d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -346,6 +346,10 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } + +/* empty cpumask */ +static const cpumask_t cpu_mask_none; + /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ @@ -358,13 +362,23 @@ cpumask_t *_node_to_cpumask_ptr(int node) dump_stack(); return &cpu_online_map; } - BUG_ON(node >= nr_node_ids); - return &node_to_cpumask_map[node]; + if (node >= nr_node_ids) { + printk(KERN_WARNING + "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return (cpumask_t *)&cpu_mask_none; + } + return (cpumask_t *)&node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); /* * Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. */ cpumask_t node_to_cpumask(int node) { @@ -374,7 +388,13 @@ cpumask_t node_to_cpumask(int node) dump_stack(); return cpu_online_map; } - BUG_ON(node >= nr_node_ids); + if (node >= nr_node_ids) { + printk(KERN_WARNING + "node_to_cpumask(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_mask_none; + } return node_to_cpumask_map[node]; } EXPORT_SYMBOL(node_to_cpumask); diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 1f97758..98e5f17 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -57,7 +57,12 @@ static inline int cpu_to_node(int cpu) } #define early_cpu_to_node(cpu) cpu_to_node(cpu) -/* Returns a bitmask of CPUs on Node 'node'. */ +/* Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ static inline cpumask_t node_to_cpumask(int node) { return node_to_cpumask_map[node]; -- cgit v1.1 From fd6493e16625b92a506fba13deda31c0be5f1cd4 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Wed, 25 Jun 2008 11:02:42 -0700 Subject: x86: cleanup e820_setup_gap(), v2 e820_search_gap also take a end_addr parameter to limit search from start_addr to end_addr. Signed-off-by: AloK N Kataria <akataria@vmware.com> Acked-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: "lenb@kernel.org" <lenb@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 12 +++++++----- include/asm-x86/e820.h | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e03b89a..d3f6c5f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -487,17 +487,19 @@ void __init update_e820(void) printk(KERN_INFO "modified physical RAM map:\n"); e820_print_map("modified"); } - +#define MAX_GAP_END 0x100000000ull /* - * Search for a gap in the e820 memory space from start_addr to 2^32. + * Search for a gap in the e820 memory space from start_addr to end_addr. */ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr) + unsigned long start_addr, unsigned long long end_addr) { - unsigned long long last = 0x100000000ull; + unsigned long long last; int i = e820.nr_map; int found = 0; + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; + while (--i >= 0) { unsigned long long start = e820.map[i].addr; unsigned long long end = start + e820.map[i].size; @@ -537,7 +539,7 @@ __init void e820_setup_gap(void) gapstart = 0x10000000; gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize, 0); + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); #ifdef CONFIG_X86_64 if (!found) { diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 45e904f..05a7dc3 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -75,7 +75,7 @@ extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, extern void update_e820(void); extern void e820_setup_gap(void); extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, - unsigned long start_addr); + unsigned long start_addr, unsigned long long end_addr); struct setup_data; extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); -- cgit v1.1 From 32105f7fd8faa7bc3d101dcc3eabc0ae1ac375a7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Thu, 26 Jun 2008 21:54:08 +0200 Subject: x86: find offset for crashkernel reservation automatically This patch removes the need of the crashkernel=...@offset parameter to define a fixed offset for crashkernel reservation. That feature can be used together with a relocatable kernel where the kexec-tools relocate the kernel and get the actual offset from /proc/iomem. The use case is a kernel where the .text+.data+.bss is after 16M physical memory (debug kernel with lockdep on x86_64 can cause that) which caused a major pain in autoconfiguration in our distribution. Also, that patch unifies crashdump architectures a bit since IA64 has that semantics from the very beginning of the kdump port. Signed-off-by: Bernhard Walle <bwalle@suse.de> Cc: vgoyal@redhat.com Cc: Bernhard Walle <bwalle@suse.de> Cc: kexec@lists.infradead.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 70 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2ca12d4..b346989 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -422,6 +422,34 @@ static void __init reserve_setup_data(void) */ #ifdef CONFIG_KEXEC + +/** + * Reserve @size bytes of crashkernel memory at any suitable offset. + * + * @size: Size of the crashkernel memory to reserve. + * Returns the base address on success, and -1ULL on failure. + */ +unsigned long long find_and_reserve_crashkernel(unsigned long long size) +{ + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long start = 0LL; + + while (1) { + int ret; + + start = find_e820_area(start, ULONG_MAX, size, alignment); + if (start == -1ULL) + return start; + + /* try to reserve it */ + ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); + if (ret >= 0) + return start; + + start += alignment; + } +} + static inline unsigned long long get_total_mem(void) { unsigned long long total; @@ -444,30 +472,36 @@ static void __init reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - if (crash_base <= 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "you have to specify a base address\n"); + if (ret != 0 || crash_size <= 0) + return; + + /* 0 means: find the address automatically */ + if (crash_base <= 0) { + crash_base = find_and_reserve_crashkernel(crash_size); + if (crash_base == -1ULL) { + pr_info("crashkernel reservation failed. " + "No suitable area found.\n"); return; } - - if (reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE) < 0) { - printk(KERN_INFO "crashkernel reservation failed - " - "memory is in use\n"); + } else { + ret = reserve_bootmem_generic(crash_base, crash_size, + BOOTMEM_EXCLUSIVE); + if (ret < 0) { + pr_info("crashkernel reservation failed - " + "memory is in use\n"); return; } + } - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - } + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); } #else static void __init reserve_crashkernel(void) -- cgit v1.1 From 068b453834c4baf4e878481a9bd5103d54f60710 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Thu, 26 Jun 2008 21:55:18 +0200 Subject: x86: fix documentation bug about relocatability This patch fixes a small bug in documentation: x86_64 also has now the ability to build a relocatable kernel. Signed-off-by: Bernhard Walle <bwalle@suse.de> Cc: vgoyal@redhat.com Cc: kexec@lists.infradead.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/kdump/kdump.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index b8e52c0..9691c7f 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -109,7 +109,7 @@ There are two possible methods of using Kdump. 2) Or use the system kernel binary itself as dump-capture kernel and there is no need to build a separate dump-capture kernel. This is possible only with the architecutres which support a relocatable kernel. As - of today i386 and ia64 architectures support relocatable kernel. + of today, i386, x86_64 and ia64 architectures support relocatable kernel. Building a relocatable kernel is advantageous from the point of view that one does not have to build a second kernel for capturing the dump. But -- cgit v1.1 From dc8e8120ad291074a5fb93cfb0418466c62f6019 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 1 Jul 2008 20:02:16 -0700 Subject: x86: change copy_e820_map to append_e820_map so it has a more meaningful name. also change it to static. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 11 ++++++----- include/asm-x86/e820.h | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d3f6c5f..b01fa0d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -360,7 +360,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, return 0; } -static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map) +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) { while (nr_map) { u64 start = biosmap->addr; @@ -389,13 +389,13 @@ static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -int __init copy_e820_map(struct e820entry *biosmap, int nr_map) +static int __init append_e820_map(struct e820entry *biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) return -1; - return __copy_e820_map(biosmap, nr_map); + return __append_e820_map(biosmap, nr_map); } u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, @@ -583,7 +583,7 @@ void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) if (map_len > PAGE_SIZE) sdata = early_ioremap(pa_data, map_len); extmap = (struct e820entry *)(sdata->data); - __copy_e820_map(extmap, entries); + __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); if (map_len > PAGE_SIZE) early_iounmap(sdata, map_len); @@ -1247,7 +1247,8 @@ char *__init default_machine_specific_memory_setup(void) ARRAY_SIZE(boot_params.e820_map), &new_nr); boot_params.e820_entries = new_nr; - if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) { + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) + < 0) { u64 mem_size; /* compare results from other methods and take the greater */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 05a7dc3..4b26604 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -67,7 +67,6 @@ extern void e820_add_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); -extern int copy_e820_map(struct e820entry *biosmap, int nr_map); extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type); extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, -- cgit v1.1 From 4fcc545a7479135332f511a54611820c9f4208a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 1 Jul 2008 20:03:11 -0700 Subject: x86: make early_res_to_bootmem print out less 80 width chars Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b01fa0d..d033585 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -841,7 +841,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [ %010llx - %010llx ] %16s", i, + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, r->start, r->end, r->name); final_start = max(start, r->start); final_end = min(end, r->end); @@ -849,7 +849,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) printk(KERN_CONT "\n"); continue; } - printk(KERN_CONT " ===> [ %010llx - %010llx ]\n", + printk(KERN_CONT " ==> [%010llx - %010llx]\n", final_start, final_end); reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); -- cgit v1.1 From d9a81b4411d53196c4535c3a1258cb03d945c718 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 1 Jul 2008 20:04:10 -0700 Subject: x86: do not printout if we do not find setup_data Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b346989..4ac01d0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -399,6 +399,7 @@ static void __init reserve_setup_data(void) struct setup_data *data; u64 pa_data; char buf[32]; + int found = 0; if (boot_params.hdr.version < 0x0209) return; @@ -409,9 +410,13 @@ static void __init reserve_setup_data(void) reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); + found = 1; pa_data = data->next; early_iounmap(data, sizeof(*data)); } + if (!found) + return; + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); -- cgit v1.1 From cb95a13a8ace8612ecab042a838e5aab2ec14ef0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 2 Jul 2008 00:31:02 -0700 Subject: x86: merge zones_sizes_init for numa and non numa on 32-bit move out e820_register_active_regions from non numa zones_sizes_init() and remove numa version zones_sizes_init(). and let 32 bit call remove_all_active_ranges() in setup_arch() directly like 64-bit Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 2 -- arch/x86/mm/discontig_32.c | 16 ---------------- arch/x86/mm/init_32.c | 10 ++++------ include/asm-x86/page_32.h | 1 - 4 files changed, 4 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4ac01d0..d5de157 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); -#ifdef CONFIG_X86_64 /* Remove active ranges so rediscovery with NUMA-awareness happens */ remove_all_active_ranges(); -#endif #ifdef CONFIG_ACPI_NUMA /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index e2f0dd1..fa389d2 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -327,7 +327,6 @@ void __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -390,21 +389,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); - return; -} - void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index aa5e37c..8efe872 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -660,12 +660,14 @@ void __init initmem_init(unsigned long start_pfn, if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; memory_present(0, 0, highend_pfn); + e820_register_active_regions(0, 0, highend_pfn); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else memory_present(0, 0, max_low_pfn); + e820_register_active_regions(0, 0, max_low_pfn); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -677,25 +679,21 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - remove_all_active_ranges(); #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - e820_register_active_regions(0, 0, highend_pfn); -#else - e820_register_active_regions(0, 0, max_low_pfn); #endif free_area_init_nodes(max_zone_pfns); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 4ae1dab..ab85287 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -96,7 +96,6 @@ extern void find_low_pfn_range(void); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); extern void initmem_init(unsigned long, unsigned long); -extern void zone_sizes_init(void); extern void setup_bootmem_allocator(void); -- cgit v1.1 From 5f4765f96eebee6a0adc4009758b597ba48a0a3a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 2 Jul 2008 18:53:44 -0700 Subject: x86: move init_cpu_to_node after get_smp_config when acpi=off, cpu_to_apicid is ready after get_smp_config so need to move init_cpu_to_node after it. otherwise, we will get wrong cpu->node mapping, and it will rely on amd_detect_cmp() to correct it - but that is too late as setup_per_cpu_data is already called before that so we will get per_cpu_data on the wrong node. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d5de157..f52a6fb 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -810,10 +810,6 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#ifdef CONFIG_X86_64 - init_cpu_to_node(); -#endif - #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: @@ -822,6 +818,10 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif +#ifdef CONFIG_X86_64 + init_cpu_to_node(); +#endif + init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.1 From 329513a35d1a2b6b28d54f5c2c0dde4face8200b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Wed, 2 Jul 2008 18:54:40 -0700 Subject: x86: move prefill_possible_map calling early call it right after we are done with MADT/mptable handling, instead of doing that in setup_per_cpu_areas() later on... this way for_possible_cpu() can be used early. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 1 + arch/x86/kernel/setup_percpu.c | 10 ---------- arch/x86/kernel/smpboot.c | 8 ++++++++ include/asm-x86/smp.h | 4 ++++ 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f52a6fb..cfcfbef 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -818,6 +818,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif + prefill_possible_map(); #ifdef CONFIG_X86_64 init_cpu_to_node(); #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 43aca2d..5fc310f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -162,16 +162,6 @@ void __init setup_per_cpu_areas(void) char *ptr; int cpu; - /* no processor from mptable or madt */ - if (!num_processors) - num_processors = 1; - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#else - nr_cpu_ids = num_processors; -#endif - /* Setup cpu_pda map */ setup_cpu_pda_map(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3b19441..e1200b2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1278,12 +1278,20 @@ __init void prefill_possible_map(void) int i; int possible; + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + +#ifdef CONFIG_HOTPLUG_CPU if (additional_cpus == -1) { if (disabled_cpus > 0) additional_cpus = disabled_cpus; else additional_cpus = 0; } +#else + additional_cpus = 0; +#endif possible = num_processors + additional_cpus; if (possible > NR_CPUS) possible = NR_CPUS; diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index fad45f6..b324a06 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -119,6 +119,10 @@ static inline int num_booting_cpus(void) { return cpus_weight(cpu_callout_map); } +#else +static inline void prefill_possible_map(void) +{ +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; -- cgit v1.1 From 4a7017370aa0a94a00ae5b5705e9169cdcae5fb8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 3 Jul 2008 15:57:47 +0200 Subject: x86: move prefill_possible_map calling early, fix fix: arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `prefill_possible_map' Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/smp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index b324a06..2e221f1 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -109,8 +109,6 @@ int native_cpu_up(unsigned int cpunum); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); -extern void prefill_possible_map(void); - void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) @@ -119,11 +117,15 @@ static inline int num_booting_cpus(void) { return cpus_weight(cpu_callout_map); } +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_CPU) +extern void prefill_possible_map(void); #else static inline void prefill_possible_map(void) { } -#endif /* CONFIG_SMP */ +#endif extern unsigned disabled_cpus __cpuinitdata; -- cgit v1.1 From aea5f9f89bae5d5f9eb3fe3cddedbbfb82e6e44f Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Fri, 4 Jul 2008 12:16:55 +0200 Subject: x86: fix "x86: let setup_arch call init_apic_mappings for 32bit" add back this line lost from trap_init(): set_trap_gate(0, ÷_error); Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index f60feee..d7cc292 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1198,6 +1198,7 @@ void __init trap_init(void) early_iounmap(p, 4); #endif + set_trap_gate(0, ÷_error); set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ -- cgit v1.1 From 574977a2edde0148ea365008dceb0c2594d10b11 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 1 Jul 2008 16:46:33 -0700 Subject: x86_64/setup: unconditionally populate the pgd When allocating a new pud, unconditionally populate the pgd (why did we bother to create a new pud if we weren't going to populate it?). This will only happen if the pgd slot was empty, since any existing pud will be reused. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b10b7f1..77d129d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -616,9 +616,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); unmap_low_page(pud); - if (!after_bootmem) - pgd_populate(&init_mm, pgd_offset_k(start), - __va(pud_phys)); + pgd_populate(&init_mm, pgd_offset_k(start), + __va(pud_phys)); } if (!after_bootmem) -- cgit v1.1 From 8490638cf0fb3975f7636c5268f27d5daf4eaaa5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 1 Jul 2008 16:46:35 -0700 Subject: x86: always set _PAGE_GLOBAL in _PAGE_KERNEL* flags Consistently set _PAGE_GLOBAL in _PAGE_KERNEL flags. This makes 32- and 64-bit code consistent, and removes some special cases where __PAGE_KERNEL* did not have _PAGE_GLOBAL set, causing confusion as a result of the inconsistencies. This patch only affects x86-64, which generally always supports PGD. The x86-32 patch is next. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/head_64.S | 4 ++-- include/asm-x86/pgtable.h | 32 +++++++++++++------------------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4336f98..b07ac7b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* @@ -387,7 +387,7 @@ NEXT_PAGE(level2_kernel_pgt) * If you want to increase this then increase MODULES_VADDR * too.) */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_spare_pgt) diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 85f851a..a1c0009 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -88,7 +88,7 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #endif /* __ASSEMBLY__ */ #else #define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) #endif @@ -103,24 +103,18 @@ extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) -#ifdef CONFIG_X86_32 -# define MAKE_GLOBAL(x) __pgprot((x)) -#else -# define MAKE_GLOBAL(x) __pgprot((x) | _PAGE_GLOBAL) -#endif - -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_WC MAKE_GLOBAL(__PAGE_KERNEL_WC) -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) -#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) /* xwr */ #define __P000 PAGE_NONE -- cgit v1.1 From ef5e94af16c0c82452e1ea5d387e1203dd2198d6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 1 Jul 2008 16:46:36 -0700 Subject: x86_32: remove __PAGE_KERNEL(_EXEC) From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Older x86-32 processors do not support global mappings (PGD), so must only use it if the processor supports it. The _PAGE_KERNEL* flags always have _PAGE_KERNEL set, since logically we always want it set. This is OK even on processors which do not support PGD, since all _PAGE flags are masked with __supported_pte_mask before being turned into a real in-pagetable pte. On 32-bit systems, __supported_pte_mask is initialized to not contain _PAGE_GLOBAL, and it is then added if the CPU is found to support it. The x86-32 code used to use __PAGE_KERNEL/__PAGE_KERNEL_EXEC for this purpose, but they're now redundant and can be removed. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_32.c | 10 ++-------- include/asm-x86/pgtable.h | 10 ---------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8efe872..b5a0fd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -383,11 +383,6 @@ static void __init set_highmem_pages_init(void) # define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ -pteval_t __PAGE_KERNEL = _PAGE_KERNEL; -EXPORT_SYMBOL(__PAGE_KERNEL); - -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; - void __init native_pagetable_setup_start(pgd_t *base) { unsigned long pfn, va; @@ -509,7 +504,7 @@ void zap_low_mappings(void) int nx_enabled; -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE @@ -796,8 +791,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* Enable PGE if available */ if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + __supported_pte_mask |= _PAGE_GLOBAL; } /* diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index a1c0009..64de07e 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -78,19 +78,9 @@ #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ _PAGE_ACCESSED) -#ifdef CONFIG_X86_32 -#define _PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX) - -#ifndef __ASSEMBLY__ -extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; -#endif /* __ASSEMBLY__ */ -#else #define __PAGE_KERNEL_EXEC \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) -#endif #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -- cgit v1.1 From 5a654ba7a88d90483d0f0586297ea9075d755fc8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Tue, 1 Jul 2008 16:46:37 -0700 Subject: x86/cpa: use an undefined PTE bit for testing CPA Rather than using _PAGE_GLOBAL - which not all CPUs support - to test CPA, use one of the reserved-for-software-use PTE flags instead. This allows CPA testing to work on CPUs which don't support PGD. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Stephen Tweedie <sct@redhat.com> Cc: Eduardo Habkost <ehabkost@redhat.com> Cc: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pageattr-test.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 75f1b10..0dcd42e 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -1,8 +1,8 @@ /* * self test for change_page_attr. * - * Clears the global bit on random pages in the direct mapping, then reverts - * and compares page tables forwards and afterwards. + * Clears the a test pte bit on random pages in the direct mapping, + * then reverts and compares page tables forwards and afterwards. */ #include <linux/bootmem.h> #include <linux/kthread.h> @@ -32,6 +32,13 @@ enum { GPS = (1<<30) }; +#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) + +static int pte_testbit(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UNUSED1; +} + struct split_state { long lpg, gpg, spg, exec; long min_exec, max_exec; @@ -165,15 +172,14 @@ static int pageattr_test(void) continue; } - err = change_page_attr_clear(addr[i], len[i], - __pgprot(_PAGE_GLOBAL)); + err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; } pte = lookup_address(addr[i], &level); - if (!pte || pte_global(*pte) || pte_huge(*pte)) { + if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], pte ? (u64)pte_val(*pte) : 0ULL); failed++; @@ -198,14 +204,13 @@ static int pageattr_test(void) failed++; continue; } - err = change_page_attr_set(addr[i], len[i], - __pgprot(_PAGE_GLOBAL)); + err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; } pte = lookup_address(addr[i], &level); - if (!pte || !pte_global(*pte)) { + if (!pte || pte_testbit(*pte)) { printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", addr[i], pte ? (u64)pte_val(*pte) : 0ULL); failed++; -- cgit v1.1 From a29d1cfe9e9337aedeed505afddc8465ac709b87 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Mon, 2 Jun 2008 13:19:08 +0200 Subject: printk: export console_drivers this symbol is needed by drivers/video/xen-fbfront.ko. [ cherry-picked from tip/core/printk ] Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/printk.c b/kernel/printk.c index 028ed75..1fb1382 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress); static DECLARE_MUTEX(console_sem); static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; +EXPORT_SYMBOL_GPL(console_drivers); + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's -- cgit v1.1 From 698839fe04ae11f228074ad45614343c3921a6c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove have_arch_parse_srat -v2 we already have the same srat handling interface for 32bit. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 4 ---- arch/x86/mm/discontig_32.c | 17 ----------------- 2 files changed, 21 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67e5275..d8dddbc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -460,10 +460,6 @@ config ACPI_SRAT depends on X86_32 && ACPI && NUMA && X86_GENERICARCH select ACPI_NUMA -config HAVE_ARCH_PARSE_SRAT - def_bool y - depends on ACPI_SRAT - config X86_SUMMIT_NUMA def_bool y depends on X86_32 && NUMA && X86_GENERICARCH diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index fa389d2..5dfef9f 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -443,20 +443,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) -/* - * Dummy on 32-bit, for now: - */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ -} - -void __init acpi_numa_arch_fixup(void) -{ -} -#endif -- cgit v1.1 From 6247943d8ab699b57653afd453a4940cca70ef8a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove acpi_srat config v2 use ACPI_NUMA directly and move srat_32.c to mm/ Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 5 - arch/x86/kernel/Makefile | 1 - arch/x86/kernel/srat_32.c | 280 ---------------------------------------------- arch/x86/mm/Makefile | 3 +- arch/x86/mm/srat_32.c | 280 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/srat.h | 2 +- 6 files changed, 283 insertions(+), 288 deletions(-) delete mode 100644 arch/x86/kernel/srat_32.c create mode 100644 arch/x86/mm/srat_32.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d8dddbc..bb0c0d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -455,11 +455,6 @@ config MEMTEST memtest=4, mean do 4 test patterns. If you are unsure how to answer this question, answer Y. -config ACPI_SRAT - def_bool y - depends on X86_32 && ACPI && NUMA && X86_GENERICARCH - select ACPI_NUMA - config X86_SUMMIT_NUMA def_bool y depends on X86_32 && NUMA && X86_GENERICARCH diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 212804e..54829e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -64,7 +64,6 @@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o obj-y += vsmp_64.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module_$(BITS).o -obj-$(CONFIG_ACPI_SRAT) += srat_32.o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c deleted file mode 100644 index f41d67f..0000000 --- a/arch/x86/kernel/srat_32.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ -#include <linux/mm.h> -#include <linux/bootmem.h> -#include <linux/mmzone.h> -#include <linux/acpi.h> -#include <linux/nodemask.h> -#include <asm/srat.h> -#include <asm/topology.h> -#include <asm/smp.h> -#include <asm/e820.h> - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? - u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_APICID]; - -int numa_off __initdata; -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - memory_affinity->memory_type, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. - * There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return; - } - if (memory_chunk->nid != nid) - return; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j, nid; - - - if (srat_disabled()) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_WARNING - "could not finy any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_APICID; i++) - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - node_read_chunk(chunk->nid, chunk); - e820_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = min(node_end_pfn[nid], max_pfn); - - memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); - } - return 1; -out_fail: - printk(KERN_ERR "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index b7b3e4c..c107641 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,5 +13,6 @@ obj-$(CONFIG_NUMA) += discontig_32.o else obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_64.o endif +obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o + diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c new file mode 100644 index 0000000..f41d67f --- /dev/null +++ b/arch/x86/mm/srat_32.c @@ -0,0 +1,280 @@ +/* + * Some of the code in this file has been gleaned from the 64 bit + * discontigmem support code base. + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to Pat Gaughen <gone@us.ibm.com> + */ +#include <linux/mm.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <linux/acpi.h> +#include <linux/nodemask.h> +#include <asm/srat.h> +#include <asm/topology.h> +#include <asm/smp.h> +#include <asm/e820.h> + +/* + * proximity macros and definitions + */ +#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ +#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ +#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) +#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) +/* bitmap length; _PXM is at most 255 */ +#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) +static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ + +#define MAX_CHUNKS_PER_NODE 3 +#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) +struct node_memory_chunk_s { + unsigned long start_pfn; + unsigned long end_pfn; + u8 pxm; // proximity domain of node + u8 nid; // which cnode contains this chunk? + u8 bank; // which mem bank on this node +}; +static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; + +static int __initdata num_memory_chunks; /* total number of memory chunks */ +static u8 __initdata apicid_to_pxm[MAX_APICID]; + +int numa_off __initdata; +int acpi_numa __initdata; + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + num_memory_chunks = 0; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* Identify CPU proximity domains */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) +{ + if (srat_disabled()) + return; + if (cpu_affinity->header.length != + sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + + if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; /* empty entry */ + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + + apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; + + printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", + cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); +} + +/* + * Identify memory proximity domains and hot-remove capabilities. + * Fill node memory chunk list structure. + */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) +{ + unsigned long long paddr, size; + unsigned long start_pfn, end_pfn; + u8 pxm; + struct node_memory_chunk_s *p, *q, *pend; + + if (srat_disabled()) + return; + if (memory_affinity->header.length != + sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + + if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; /* empty entry */ + + pxm = memory_affinity->proximity_domain & 0xff; + + /* mark this node as "seen" in node bitmap */ + BMAP_SET(pxm_bitmap, pxm); + + /* calculate info for memory chunk structure */ + paddr = memory_affinity->base_address; + size = memory_affinity->length; + + start_pfn = paddr >> PAGE_SHIFT; + end_pfn = (paddr + size) >> PAGE_SHIFT; + + + if (num_memory_chunks >= MAXCHUNKS) { + printk(KERN_WARNING "Too many mem chunks in SRAT." + " Ignoring %lld MBytes at %llx\n", + size/(1024*1024), paddr); + return; + } + + /* Insertion sort based on base address */ + pend = &node_memory_chunk[num_memory_chunks]; + for (p = &node_memory_chunk[0]; p < pend; p++) { + if (start_pfn < p->start_pfn) + break; + } + if (p < pend) { + for (q = pend; q >= p; q--) + *(q + 1) = *q; + } + p->start_pfn = start_pfn; + p->end_pfn = end_pfn; + p->pxm = pxm; + + num_memory_chunks++; + + printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + " in proximity domain %02x %s\n", + start_pfn, end_pfn, + memory_affinity->memory_type, + pxm, + ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? + "enabled and removable" : "enabled" ) ); +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} +/* + * The SRAT table always lists ascending addresses, so can always + * assume that the first "start" address that you see is the real + * start of the node, and that the current "end" address is after + * the previous one. + */ +static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) +{ + /* + * Only add present memory as told by the e820. + * There is no guarantee from the SRAT that the memory it + * enumerates is present at boot time because it represents + * *possible* memory hotplug areas the same as normal RAM. + */ + if (memory_chunk->start_pfn >= max_pfn) { + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", + memory_chunk->start_pfn, memory_chunk->end_pfn); + return; + } + if (memory_chunk->nid != nid) + return; + + if (!node_has_online_mem(nid)) + node_start_pfn[nid] = memory_chunk->start_pfn; + + if (node_start_pfn[nid] > memory_chunk->start_pfn) + node_start_pfn[nid] = memory_chunk->start_pfn; + + if (node_end_pfn[nid] < memory_chunk->end_pfn) + node_end_pfn[nid] = memory_chunk->end_pfn; +} + +int __init get_memcfg_from_srat(void) +{ + int i, j, nid; + + + if (srat_disabled()) + goto out_fail; + + if (num_memory_chunks == 0) { + printk(KERN_WARNING + "could not finy any ACPI SRAT memory areas.\n"); + goto out_fail; + } + + /* Calculate total number of nodes in system from PXM bitmap and create + * a set of sequential node IDs starting at zero. (ACPI doesn't seem + * to specify the range of _PXM values.) + */ + /* + * MCD - we no longer HAVE to number nodes sequentially. PXM domain + * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically + * 32, so we will continue numbering them in this manner until MAX_NUMNODES + * approaches MAX_PXM_DOMAINS for i386. + */ + nodes_clear(node_online_map); + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (BMAP_TEST(pxm_bitmap, i)) { + int nid = acpi_map_pxm_to_node(i); + node_set_online(nid); + } + } + BUG_ON(num_online_nodes() == 0); + + /* set cnode id in memory chunk structure */ + for (i = 0; i < num_memory_chunks; i++) + node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); + + printk(KERN_DEBUG "pxm bitmap: "); + for (i = 0; i < sizeof(pxm_bitmap); i++) { + printk(KERN_CONT "%02x ", pxm_bitmap[i]); + } + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "Number of logical nodes in system = %d\n", + num_online_nodes()); + printk(KERN_DEBUG "Number of memory chunks in system = %d\n", + num_memory_chunks); + + for (i = 0; i < MAX_APICID; i++) + apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + + for (j = 0; j < num_memory_chunks; j++){ + struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; + printk(KERN_DEBUG + "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", + j, chunk->nid, chunk->start_pfn, chunk->end_pfn); + node_read_chunk(chunk->nid, chunk); + e820_register_active_regions(chunk->nid, chunk->start_pfn, + min(chunk->end_pfn, max_pfn)); + } + + for_each_online_node(nid) { + unsigned long start = node_start_pfn[nid]; + unsigned long end = min(node_end_pfn[nid], max_pfn); + + memory_present(nid, start, end); + node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); + } + return 1; +out_fail: + printk(KERN_ERR "failed to get NUMA memory information from SRAT" + " table\n"); + return 0; +} diff --git a/include/asm-x86/srat.h b/include/asm-x86/srat.h index 456fe0b..774c919 100644 --- a/include/asm-x86/srat.h +++ b/include/asm-x86/srat.h @@ -27,7 +27,7 @@ #ifndef _ASM_SRAT_H_ #define _ASM_SRAT_H_ -#ifdef CONFIG_ACPI_SRAT +#ifdef CONFIG_ACPI_NUMA extern int get_memcfg_from_srat(void); #else static inline int get_memcfg_from_srat(void) -- cgit v1.1 From 69ac9cd629ca96e59f34eb4ccd12d00b2c8276a7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Fri, 27 Jun 2008 13:12:54 +0200 Subject: sysfs: add /sys/firmware/memmap This patch adds /sys/firmware/memmap interface that represents the BIOS (or Firmware) provided memory map. The tree looks like: /sys/firmware/memmap/0/start (hex number) end (hex number) type (string) ... /1/start end type With the following shell snippet one can print the memory map in the same form the kernel prints itself when booting on x86 (the E820 map). --------- 8< -------------------------- #!/bin/sh cd /sys/firmware/memmap for dir in * ; do start=$(cat $dir/start) end=$(cat $dir/end) type=$(cat $dir/type) printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type" done --------- >8 -------------------------- That patch only provides the needed interface: 1. The sysfs interface. 2. The structure and enumeration definition. 3. The function firmware_map_add() and firmware_map_add_early() that should be called from architecture code (E820/EFI, for example) to add the contents to the interface. If the kernel is compiled without CONFIG_FIRMWARE_MEMMAP, the interface does nothing without cluttering the architecture-specific code with #ifdef's. The purpose of the new interface is kexec: While /proc/iomem represents the *used* memory map (e.g. modified via kernel parameters like 'memmap' and 'mem'), the /sys/firmware/memmap tree represents the unmodified memory map provided via the firmware. So kexec can: - use the original memory map for rebooting, - use the /proc/iomem for setting up the ELF core headers for kdump case that should only represent the memory of the system. The patch has been tested on i386 and x86_64. Signed-off-by: Bernhard Walle <bwalle@suse.de> Acked-by: Greg KH <gregkh@suse.de> Acked-by: Vivek Goyal <vgoyal@redhat.com> Cc: kexec@lists.infradead.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- Documentation/ABI/testing/sysfs-firmware-memmap | 71 ++++++++ drivers/firmware/Kconfig | 10 ++ drivers/firmware/Makefile | 1 + drivers/firmware/memmap.c | 205 ++++++++++++++++++++++++ include/linux/firmware-map.h | 74 +++++++++ 5 files changed, 361 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-memmap create mode 100644 drivers/firmware/memmap.c create mode 100644 include/linux/firmware-map.h diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap new file mode 100644 index 0000000..0d99ee6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-memmap @@ -0,0 +1,71 @@ +What: /sys/firmware/memmap/ +Date: June 2008 +Contact: Bernhard Walle <bwalle@suse.de> +Description: + On all platforms, the firmware provides a memory map which the + kernel reads. The resources from that memory map are registered + in the kernel resource tree and exposed to userspace via + /proc/iomem (together with other resources). + + However, on most architectures that firmware-provided memory + map is modified afterwards by the kernel itself, either because + the kernel merges that memory map with other information or + just because the user overwrites that memory map via command + line. + + kexec needs the raw firmware-provided memory map to setup the + parameter segment of the kernel that should be booted with + kexec. Also, the raw memory map is useful for debugging. For + that reason, /sys/firmware/memmap is an interface that provides + the raw memory map to userspace. + + The structure is as follows: Under /sys/firmware/memmap there + are subdirectories with the number of the entry as their name: + + /sys/firmware/memmap/0 + /sys/firmware/memmap/1 + /sys/firmware/memmap/2 + /sys/firmware/memmap/3 + ... + + The maximum depends on the number of memory map entries provided + by the firmware. The order is just the order that the firmware + provides. + + Each directory contains three files: + + start : The start address (as hexadecimal number with the + '0x' prefix). + end : The end address, inclusive (regardless whether the + firmware provides inclusive or exclusive ranges). + type : Type of the entry as string. See below for a list of + valid types. + + So, for example: + + /sys/firmware/memmap/0/start + /sys/firmware/memmap/0/end + /sys/firmware/memmap/0/type + /sys/firmware/memmap/1/start + ... + + Currently following types exist: + + - System RAM + - ACPI Tables + - ACPI Non-volatile Storage + - reserved + + Following shell snippet can be used to display that memory + map in a human-readable format: + + -------------------- 8< ---------------------------------------- + #!/bin/bash + cd /sys/firmware/memmap + for dir in * ; do + start=$(cat $dir/start) + end=$(cat $dir/end) + type=$(cat $dir/type) + printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type" + done + -------------------- >8 ---------------------------------------- diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index dc2cec6..ebb9e51 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -26,6 +26,16 @@ config EDD_OFF kernel. Say N if you want EDD enabled by default. EDD can be dynamically set using the kernel parameter 'edd={on|skipmbr|off}'. +config FIRMWARE_MEMMAP + bool "Add firmware-provided memory map to sysfs" if EMBEDDED + default (X86_64 || X86_32) + help + Add the firmware-provided (unmodified) memory map to /sys/firmware/memmap. + That memory map is used for example by kexec to set up parameter area + for the next kernel, but can also be used for debugging purposes. + + See also Documentation/ABI/testing/sysfs-firmware-memmap. + config EFI_VARS tristate "EFI Variable Support via sysfs" depends on EFI diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index 4c91471..1c3c173 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DCDBAS) += dcdbas.o obj-$(CONFIG_DMIID) += dmi-id.o obj-$(CONFIG_ISCSI_IBFT_FIND) += iscsi_ibft_find.o obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o +obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c new file mode 100644 index 0000000..e23399c --- /dev/null +++ b/drivers/firmware/memmap.c @@ -0,0 +1,205 @@ +/* + * linux/drivers/firmware/memmap.c + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle <bwalle@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/string.h> +#include <linux/firmware-map.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/bootmem.h> + +/* + * Data types ------------------------------------------------------------------ + */ + +/* + * Firmware map entry. Because firmware memory maps are flat and not + * hierarchical, it's ok to organise them in a linked list. No parent + * information is necessary as for the resource tree. + */ +struct firmware_map_entry { + resource_size_t start; /* start of the memory range */ + resource_size_t end; /* end of the memory range (incl.) */ + const char *type; /* type of the memory range */ + struct list_head list; /* entry for the linked list */ + struct kobject kobj; /* kobject for each entry */ +}; + +/* + * Forward declarations -------------------------------------------------------- + */ +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf); +static ssize_t start_show(struct firmware_map_entry *entry, char *buf); +static ssize_t end_show(struct firmware_map_entry *entry, char *buf); +static ssize_t type_show(struct firmware_map_entry *entry, char *buf); + +/* + * Static data ----------------------------------------------------------------- + */ + +struct memmap_attribute { + struct attribute attr; + ssize_t (*show)(struct firmware_map_entry *entry, char *buf); +}; + +struct memmap_attribute memmap_start_attr = __ATTR_RO(start); +struct memmap_attribute memmap_end_attr = __ATTR_RO(end); +struct memmap_attribute memmap_type_attr = __ATTR_RO(type); + +/* + * These are default attributes that are added for every memmap entry. + */ +static struct attribute *def_attrs[] = { + &memmap_start_attr.attr, + &memmap_end_attr.attr, + &memmap_type_attr.attr, + NULL +}; + +static struct sysfs_ops memmap_attr_ops = { + .show = memmap_attr_show, +}; + +static struct kobj_type memmap_ktype = { + .sysfs_ops = &memmap_attr_ops, + .default_attrs = def_attrs, +}; + +/* + * Registration functions ------------------------------------------------------ + */ + +/* + * Firmware memory map entries + */ +static LIST_HEAD(map_entries); + +/** + * Common implementation of firmware_map_add() and firmware_map_add_early() + * which expects a pre-allocated struct firmware_map_entry. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * @entry: Pre-allocated (either kmalloc() or bootmem allocator), uninitialised + * entry. + */ +static int firmware_map_add_entry(resource_size_t start, resource_size_t end, + const char *type, + struct firmware_map_entry *entry) +{ + BUG_ON(start > end); + + entry->start = start; + entry->end = end; + entry->type = type; + INIT_LIST_HEAD(&entry->list); + kobject_init(&entry->kobj, &memmap_ktype); + + list_add_tail(&entry->list, &map_entries); + + return 0; +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = kmalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int __init firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = alloc_bootmem_low(sizeof(struct firmware_map_entry)); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * Sysfs functions ------------------------------------------------------------- + */ + +static ssize_t start_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->start); +} + +static ssize_t end_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->end); +} + +static ssize_t type_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); +} + +#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) + +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct firmware_map_entry *entry = to_memmap_entry(kobj); + struct memmap_attribute *memmap_attr = to_memmap_attr(attr); + + return memmap_attr->show(entry, buf); +} + +/* + * Initialises stuff and adds the entries in the map_entries list to + * sysfs. Important is that firmware_map_add() and firmware_map_add_early() + * must be called before late_initcall. + */ +static int __init memmap_init(void) +{ + int i = 0; + struct firmware_map_entry *entry; + struct kset *memmap_kset; + + memmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); + WARN_ON(!memmap_kset); + if (!memmap_kset) + return -ENOMEM; + + list_for_each_entry(entry, &map_entries, list) { + entry->kobj.kset = memmap_kset; + kobject_add(&entry->kobj, NULL, "%d", i++); + } + + return 0; +} +late_initcall(memmap_init); + diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h new file mode 100644 index 0000000..acbdbcc --- /dev/null +++ b/include/linux/firmware-map.h @@ -0,0 +1,74 @@ +/* + * include/linux/firmware-map.h: + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle <bwalle@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _LINUX_FIRMWARE_MAP_H +#define _LINUX_FIRMWARE_MAP_H + +#include <linux/list.h> +#include <linux/kobject.h> + +/* + * provide a dummy interface if CONFIG_FIRMWARE_MEMMAP is disabled + */ +#ifdef CONFIG_FIRMWARE_MEMMAP + +/** + * Adds a firmware mapping entry. This function uses kmalloc() for memory + * allocation. Use firmware_map_add_early() if you want to use the bootmem + * allocator. + * + * That function must be called before late_initcall. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type); + +/** + * Adds a firmware mapping entry. This function uses the bootmem allocator + * for memory allocation. Use firmware_map_add() if you want to use kmalloc(). + * + * That function must be called before late_initcall. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type); + +#else /* CONFIG_FIRMWARE_MEMMAP */ + +static inline int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + return 0; +} + +static inline int firmware_map_add_early(resource_size_t start, + resource_size_t end, const char *type) +{ + return 0; +} + +#endif /* CONFIG_FIRMWARE_MEMMAP */ + +#endif /* _LINUX_FIRMWARE_MAP_H */ -- cgit v1.1 From 5dfcf14d5b28174f94cbe9b4fb35d415db61c64a Mon Sep 17 00:00:00 2001 From: Bernhard Walle <bwalle@suse.de> Date: Fri, 27 Jun 2008 13:12:55 +0200 Subject: x86: use FIRMWARE_MEMMAP on x86/E820 This patch uses the /sys/firmware/memmap interface provided in the last patch on the x86 architecture when E820 is used. The patch copies the E820 memory map very early, and registers the E820 map afterwards via firmware_map_add_early(). Signed-off-by: Bernhard Walle <bwalle@suse.de> Acked-by: Greg KH <gregkh@suse.de> Acked-by: Vivek Goyal <vgoyal@redhat.com> Cc: kexec@lists.infradead.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 44 +++++++++++++++++++++++++++++++++++++------- include/asm-x86/e820.h | 2 ++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d033585..fc1d579 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/firmware-map.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -27,7 +28,22 @@ #include <asm/setup.h> #include <asm/trampoline.h> +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ struct e820map e820; +struct e820map e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -1198,6 +1214,17 @@ void __init finish_e820_parsing(void) } } +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + default: return "reserved"; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -1209,13 +1236,6 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { - switch (e820.map[i].type) { - case E820_RESERVED_KERN: - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } end = e820.map[i].addr + e820.map[i].size - 1; #ifndef CONFIG_RESOURCES_64BIT if (end > 0x100000000ULL) { @@ -1223,6 +1243,7 @@ void __init e820_reserve_resources(void) continue; } #endif + res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; @@ -1230,6 +1251,13 @@ void __init e820_reserve_resources(void) insert_resource(&iomem_resource, res); res++; } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } } char *__init default_machine_specific_memory_setup(void) @@ -1266,6 +1294,8 @@ char *__init default_machine_specific_memory_setup(void) e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } + memcpy(&e820_saved, &e820, sizeof(struct e820map)); + /* In case someone cares... */ return who; } diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 4b26604..a20d0a7f 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -59,7 +59,9 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +/* see comment in arch/x86/kernel/e820.c */ extern struct e820map e820; +extern struct e820map e820_saved; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); -- cgit v1.1 From 0be15526beb4c228e0477221c62ec8ab0fc7440f Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 3 Jul 2008 11:35:37 -0700 Subject: x86: move saving e820_saved to setup_memory_map so other path that will override memory_setup or machine_specific_memory_setup could have e820_saved too. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index fc1d579..13e3298 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1294,8 +1294,6 @@ char *__init default_machine_specific_memory_setup(void) e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } - memcpy(&e820_saved, &e820, sizeof(struct e820map)); - /* In case someone cares... */ return who; } @@ -1313,8 +1311,12 @@ char * __init __attribute__((weak)) memory_setup(void) void __init setup_memory_map(void) { + char *who; + + who = memory_setup(); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(memory_setup()); + e820_print_map(who); } #ifdef CONFIG_X86_64 -- cgit v1.1 From a0a0becd2da0ba0d7f0204a61d1905926cdb163d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 3 Jul 2008 11:37:13 -0700 Subject: x86: make e820_saved have update from setup_data seperate reserve_setup_data into e820_reserved_setup_data, and reserve_early_setup_data. So could use e820_reserved_setup_data to backup e820 with setup_data. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Bernhard Walle <bwalle@suse.de> Cc: Ying Huang <ying.huang@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/setup.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cfcfbef..bea8ae7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -394,11 +394,10 @@ static void __init parse_setup_data(void) } } -static void __init reserve_setup_data(void) +static void __init e820_reserve_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; int found = 0; if (boot_params.hdr.version < 0x0209) @@ -406,8 +405,6 @@ static void __init reserve_setup_data(void) pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_ioremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -418,10 +415,29 @@ static void __init reserve_setup_data(void) return; sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } +static void __init reserve_early_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} + /* * --------- Crashkernel reservation ------------------------------ */ @@ -626,6 +642,8 @@ void __init setup_arch(char **cmdline_p) setup_memory_map(); parse_setup_data(); + /* update the e820_saved too */ + e820_reserve_setup_data(); copy_edd(); @@ -656,7 +674,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); /* after early param, so could get panic from serial */ - reserve_setup_data(); + reserve_early_setup_data(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.1 From fc9036ea1a4b14229788e6df3936b451a6abac98 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 3 Jul 2008 11:39:00 -0700 Subject: x86: let early_reserve_e820 update e820_saved too so when it is called after early_param, e820_saved get updated too. esp for mpc update. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: Bernhard Walle <bwalle@suse.de> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 13e3298..e07d401 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -414,8 +414,9 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map) return __append_e820_map(biosmap, nr_map); } -u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, - unsigned new_type) +static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) { int i; u64 real_updated_size = 0; @@ -426,7 +427,7 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + struct e820entry *ei = &e820x->map[i]; u64 final_start, final_end; if (ei->type != old_type) continue; @@ -454,6 +455,19 @@ u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, return real_updated_size; } +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return e820_update_range_map(&e820, start, size, old_type, new_type); +} + +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ + return e820_update_range_map(&e820_saved, start, size, old_type, + new_type); +} + /* make e820 not cover the range */ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) @@ -503,6 +517,15 @@ void __init update_e820(void) printk(KERN_INFO "modified physical RAM map:\n"); e820_print_map("modified"); } +static void __init update_e820_saved(void) +{ + int nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} #define MAX_GAP_END 0x100000000ull /* * Search for a gap in the e820 memory space from start_addr to end_addr. @@ -1007,8 +1030,10 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) addr = round_down(start + size - sizet, align); e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); printk(KERN_INFO "update e820 for early_reserve_e820\n"); update_e820(); + update_e820_saved(); return addr; } -- cgit v1.1 From 3a9e189d69479736a0d0901c87ad08c9e328b389 Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Tue, 1 Jul 2008 14:45:32 -0500 Subject: x86: map UV chipset space - pagetable Add boot-time function for creating additional 2MB page table entries for mapping chipset specific cached/uncached ranges. Signed-off-by: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/asm-x86/page_64.h | 4 ++++ include/asm-x86/pgtable.h | 2 ++ 3 files changed, 46 insertions(+) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 77d129d..57d5eff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -202,6 +202,46 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) } /* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + pud = (pud_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(pgd, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } +} + +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); +} + +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); +} + +/* * The head.S code sets up the kernel high mapping: * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index 010d12d..c6916c8 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -91,6 +91,10 @@ extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); + +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 64de07e..49cbd76 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -91,6 +91,7 @@ #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) @@ -102,6 +103,7 @@ #define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) #define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) -- cgit v1.1 From 83f5d894ca5280bfcd904dfeb1347c2da2b19aac Mon Sep 17 00:00:00 2001 From: Jack Steiner <steiner@sgi.com> Date: Tue, 1 Jul 2008 14:45:38 -0500 Subject: x86: map UV chipset space - UV support Create page table entries to map the SGI UV chipset GRU. local MMR & global MMR ranges. Signed-off-by: Jack Steiner <steiner@sgi.com> Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/genx2apic_uv_x.c | 75 +++++++++++++++++++++++++++++++++++++++- include/asm-x86/uv/uv_hub.h | 2 ++ include/asm-x86/uv/uv_mmrs.h | 46 ++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 45e84ac..711f11c 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -8,6 +8,7 @@ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. */ +#include <linux/kernel.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -20,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -208,14 +210,79 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) BUG(); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +enum map_type {map_wb, map_uc}; + +static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << shift; + bytes = (1UL << shift); + printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, + paddr + bytes); + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); + +} +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); + if (gru.s.enable) + map_high("GRU", gru.s.base, shift, map_wb); +} + +static __init void map_config_high(int max_pnode) +{ + union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; + int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; + + cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); + if (cfg.s.enable) + map_high("CONFIG", cfg.s.base, shift, map_uc); +} + +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, map_uc); +} + +static __init void map_mmioh_high(int max_pnode) +{ + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; + int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + if (mmioh.s.enable) + map_high("MMIOH", mmioh.s.base, shift, map_uc); +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int max_pnode = 0; unsigned long mmr_base, present; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; @@ -281,12 +348,18 @@ static __init void uv_system_init(void) uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; + max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, " + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " "lcpu %d, blade %d\n", cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, lcpu, blade); } + + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_config_high(max_pnode); + map_mmioh_high(max_pnode); } /* diff --git a/include/asm-x86/uv/uv_hub.h b/include/asm-x86/uv/uv_hub.h index 6500488..a4ef26e 100644 --- a/include/asm-x86/uv/uv_hub.h +++ b/include/asm-x86/uv/uv_hub.h @@ -149,6 +149,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) +#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) +#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 #define UV_GLOBAL_MMR64_PNODE_SHIFT 26 diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index ac98460..37113f5 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -713,6 +713,26 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { }; /* ========================================================================= */ +/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL + +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_cfg_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_cfg_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ + unsigned long rsvd_46_62: 17; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL @@ -740,6 +760,32 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { }; /* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_mmioh_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_29: 30; /* */ + unsigned long base : 16; /* RW */ + unsigned long m_io : 6; /* RW */ + unsigned long n_io : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL -- cgit v1.1 From 5d061e397db1ee7a62783a881833f3f9b89f6dc8 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich <sivanich@sgi.com> Date: Wed, 2 Jul 2008 15:39:35 -0500 Subject: x86, uv: update x86 mmr list for SGI uv This patch updates the X86 mmr list for SGI uv. Signed-off-by: Dimitri Sivanich <sivanich@sgi.com> Cc: Jack Steiner <steiner@sgi.com> Cc: Russ Anderson <rja@sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/uv/uv_mmrs.h | 535 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 478 insertions(+), 57 deletions(-) diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index 37113f5..eb59201 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -17,7 +17,7 @@ /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x0450 +#define UVH_BAU_DATA_CONFIG_32 0x0438 #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL @@ -53,10 +53,248 @@ union uvh_bau_data_config_u { }; /* ========================================================================= */ +/* UVH_EVENT_OCCURRED0 */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED0 0x70000UL +#define UVH_EVENT_OCCURRED0_32 0x005e8 + +#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 +#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL +#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 +#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL +#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 +#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL +#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 +#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL +#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 +#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL +#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 +#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL +#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 +#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL +#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 +#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL +#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 +#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL +#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 +#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL +#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL +#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 +#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL +#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 +#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL +#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 +#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL +#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 +#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL +#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 +#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL +#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 +#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL +#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 +#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL +#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 +#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL +#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 +#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL +#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 +#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL +#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 +#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 +#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL +#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 +#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL +#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 +#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 +#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 +#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL +#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 +#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL +#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 +#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 +#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 +#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL +#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 +#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL +#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 +#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL +#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 +#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL +#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 +#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL +#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 +#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL +#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 +#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL +#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 +#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL +#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 +#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL +union uvh_event_occurred0_u { + unsigned long v; + struct uvh_event_occurred0_s { + unsigned long lb_hcerr : 1; /* RW, W1C */ + unsigned long gr0_hcerr : 1; /* RW, W1C */ + unsigned long gr1_hcerr : 1; /* RW, W1C */ + unsigned long lh_hcerr : 1; /* RW, W1C */ + unsigned long rh_hcerr : 1; /* RW, W1C */ + unsigned long xn_hcerr : 1; /* RW, W1C */ + unsigned long si_hcerr : 1; /* RW, W1C */ + unsigned long lb_aoerr0 : 1; /* RW, W1C */ + unsigned long gr0_aoerr0 : 1; /* RW, W1C */ + unsigned long gr1_aoerr0 : 1; /* RW, W1C */ + unsigned long lh_aoerr0 : 1; /* RW, W1C */ + unsigned long rh_aoerr0 : 1; /* RW, W1C */ + unsigned long xn_aoerr0 : 1; /* RW, W1C */ + unsigned long si_aoerr0 : 1; /* RW, W1C */ + unsigned long lb_aoerr1 : 1; /* RW, W1C */ + unsigned long gr0_aoerr1 : 1; /* RW, W1C */ + unsigned long gr1_aoerr1 : 1; /* RW, W1C */ + unsigned long lh_aoerr1 : 1; /* RW, W1C */ + unsigned long rh_aoerr1 : 1; /* RW, W1C */ + unsigned long xn_aoerr1 : 1; /* RW, W1C */ + unsigned long si_aoerr1 : 1; /* RW, W1C */ + unsigned long rh_vpi_int : 1; /* RW, W1C */ + unsigned long system_shutdown_int : 1; /* RW, W1C */ + unsigned long lb_irq_int_0 : 1; /* RW, W1C */ + unsigned long lb_irq_int_1 : 1; /* RW, W1C */ + unsigned long lb_irq_int_2 : 1; /* RW, W1C */ + unsigned long lb_irq_int_3 : 1; /* RW, W1C */ + unsigned long lb_irq_int_4 : 1; /* RW, W1C */ + unsigned long lb_irq_int_5 : 1; /* RW, W1C */ + unsigned long lb_irq_int_6 : 1; /* RW, W1C */ + unsigned long lb_irq_int_7 : 1; /* RW, W1C */ + unsigned long lb_irq_int_8 : 1; /* RW, W1C */ + unsigned long lb_irq_int_9 : 1; /* RW, W1C */ + unsigned long lb_irq_int_10 : 1; /* RW, W1C */ + unsigned long lb_irq_int_11 : 1; /* RW, W1C */ + unsigned long lb_irq_int_12 : 1; /* RW, W1C */ + unsigned long lb_irq_int_13 : 1; /* RW, W1C */ + unsigned long lb_irq_int_14 : 1; /* RW, W1C */ + unsigned long lb_irq_int_15 : 1; /* RW, W1C */ + unsigned long l1_nmi_int : 1; /* RW, W1C */ + unsigned long stop_clock : 1; /* RW, W1C */ + unsigned long asic_to_l1 : 1; /* RW, W1C */ + unsigned long l1_to_asic : 1; /* RW, W1C */ + unsigned long ltc_int : 1; /* RW, W1C */ + unsigned long la_seq_trigger : 1; /* RW, W1C */ + unsigned long ipi_int : 1; /* RW, W1C */ + unsigned long extio_int0 : 1; /* RW, W1C */ + unsigned long extio_int1 : 1; /* RW, W1C */ + unsigned long extio_int2 : 1; /* RW, W1C */ + unsigned long extio_int3 : 1; /* RW, W1C */ + unsigned long profile_int : 1; /* RW, W1C */ + unsigned long rtc0 : 1; /* RW, W1C */ + unsigned long rtc1 : 1; /* RW, W1C */ + unsigned long rtc2 : 1; /* RW, W1C */ + unsigned long rtc3 : 1; /* RW, W1C */ + unsigned long bau_data : 1; /* RW, W1C */ + unsigned long power_management_req : 1; /* RW, W1C */ + unsigned long rsvd_57_63 : 7; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_EVENT_OCCURRED0_ALIAS */ +/* ========================================================================= */ +#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 + +/* ========================================================================= */ +/* UVH_INT_CMPB */ +/* ========================================================================= */ +#define UVH_INT_CMPB 0x22080UL + +#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 +#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpb_u { + unsigned long v; + struct uvh_int_cmpb_s { + unsigned long real_time_cmpb : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_INT_CMPC */ +/* ========================================================================= */ +#define UVH_INT_CMPC 0x22100UL + +#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpc_u { + unsigned long v; + struct uvh_int_cmpc_s { + unsigned long real_time_cmpc : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ +/* UVH_INT_CMPD */ +/* ========================================================================= */ +#define UVH_INT_CMPD 0x22180UL + +#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL + +union uvh_int_cmpd_u { + unsigned long v; + struct uvh_int_cmpd_s { + unsigned long real_time_cmpd : 56; /* RW */ + unsigned long rsvd_56_63 : 8; /* */ + } s; +}; + +/* ========================================================================= */ /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x0360 +#define UVH_IPI_INT_32 0x0348 #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL @@ -86,7 +324,7 @@ union uvh_ipi_int_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009f0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL @@ -108,7 +346,7 @@ union uvh_lb_bau_intd_payload_queue_first_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009f8 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL @@ -126,7 +364,7 @@ union uvh_lb_bau_intd_payload_queue_last_u { /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x00a00 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL @@ -144,7 +382,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0aa0 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL @@ -205,13 +443,13 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0aa8 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009d8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL @@ -234,7 +472,7 @@ union uvh_lb_bau_sb_activation_control_u { /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009e0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL @@ -250,7 +488,7 @@ union uvh_lb_bau_sb_activation_status_0_u { /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009e8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL @@ -266,7 +504,7 @@ union uvh_lb_bau_sb_activation_status_1_u { /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ #define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009d0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL @@ -333,46 +571,48 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL #define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21 #define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_SHFT 22 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_TIMEOUT_MASK 0x0000000000400000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 23 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000000800000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 24 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000001000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 25 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000002000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 26 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000004000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 27 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000008000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 28 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000010000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 29 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000020000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 30 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000040000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 31 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000080000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 32 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000100000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 33 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000200000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 34 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000400000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 35 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000000800000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 36 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000001000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 37 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000002000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 38 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000004000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 39 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000008000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 40 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000010000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 41 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000020000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42 +#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL union uvh_lb_mcast_aoerr0_rpt_enable_u { unsigned long v; @@ -399,7 +639,8 @@ union uvh_lb_mcast_aoerr0_rpt_enable_u { unsigned long macc_rep_runt_msg : 1; /* RW */ unsigned long macc_rep_obese_msg : 1; /* RW */ unsigned long macc_rep_data_sb_err : 1; /* RW */ - unsigned long macc_timeout : 1; /* RW */ + unsigned long macc_amo_timeout : 1; /* RW */ + unsigned long macc_put_timeout : 1; /* RW */ unsigned long macc_spurious_event : 1; /* RW */ unsigned long ioh_destination_table_parity : 1; /* RW */ unsigned long get_had_error_reply : 1; /* RW */ @@ -419,7 +660,7 @@ union uvh_lb_mcast_aoerr0_rpt_enable_u { unsigned long int_rep_obese_msg : 1; /* RW */ unsigned long int_rep_command_err : 1; /* RW */ unsigned long int_timeout : 1; /* RW */ - unsigned long rsvd_42_63 : 22; /* */ + unsigned long rsvd_43_63 : 21; /* */ } s; }; @@ -733,14 +974,34 @@ union uvh_rh_gam_cfg_overlay_config_mmr_u { }; /* ========================================================================= */ +/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL + +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_cfg_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_cfg_overlay_config_mmr_s { + unsigned long rsvd_0_25: 26; /* */ + unsigned long base : 20; /* RW */ + unsigned long rsvd_46_62: 17; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 46 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0000400000000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 @@ -751,8 +1012,9 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { struct uvh_rh_gam_gru_overlay_config_mmr_s { unsigned long rsvd_0_27: 28; /* */ unsigned long base : 18; /* RW */ + unsigned long rsvd_46_47: 2; /* */ unsigned long gr4 : 1; /* RW */ - unsigned long rsvd_47_51: 5; /* */ + unsigned long rsvd_49_51: 3; /* */ unsigned long n_gru : 4; /* RW */ unsigned long rsvd_56_62: 7; /* */ unsigned long enable : 1; /* RW */ @@ -786,6 +1048,32 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { }; /* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_mmioh_overlay_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_29: 30; /* */ + unsigned long base : 16; /* RW */ + unsigned long m_io : 6; /* RW */ + unsigned long n_io : 4; /* RW */ + unsigned long rsvd_56_62: 7; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL @@ -811,7 +1099,7 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x28000UL +#define UVH_RTC 0x340000UL #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL @@ -825,6 +1113,139 @@ union uvh_rtc_u { }; /* ========================================================================= */ +/* UVH_RTC1_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC1_INT_CONFIG 0x615c0UL + +#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC1_INT_CONFIG_P_SHFT 13 +#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC1_INT_CONFIG_T_SHFT 15 +#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC1_INT_CONFIG_M_SHFT 16 +#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc1_int_config_u { + unsigned long v; + struct uvh_rtc1_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC2_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC2_INT_CONFIG 0x61600UL + +#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC2_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC2_INT_CONFIG_P_SHFT 13 +#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC2_INT_CONFIG_T_SHFT 15 +#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC2_INT_CONFIG_M_SHFT 16 +#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc2_int_config_u { + unsigned long v; + struct uvh_rtc2_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC3_INT_CONFIG */ +/* ========================================================================= */ +#define UVH_RTC3_INT_CONFIG 0x61640UL + +#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC3_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC3_INT_CONFIG_P_SHFT 13 +#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC3_INT_CONFIG_T_SHFT 15 +#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC3_INT_CONFIG_M_SHFT 16 +#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + +union uvh_rtc3_int_config_u { + unsigned long v; + struct uvh_rtc3_int_config_s { + unsigned long vector_ : 8; /* RW */ + unsigned long dm : 3; /* RW */ + unsigned long destmode : 1; /* RW */ + unsigned long status : 1; /* RO */ + unsigned long p : 1; /* RO */ + unsigned long rsvd_14 : 1; /* */ + unsigned long t : 1; /* RO */ + unsigned long m : 1; /* RW */ + unsigned long rsvd_17_31: 15; /* */ + unsigned long apic_id : 32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RTC_INC_RATIO */ +/* ========================================================================= */ +#define UVH_RTC_INC_RATIO 0x350000UL + +#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0 +#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL +#define UVH_RTC_INC_RATIO_RATIO_SHFT 20 +#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL + +union uvh_rtc_inc_ratio_u { + unsigned long v; + struct uvh_rtc_inc_ratio_s { + unsigned long fraction : 20; /* RW */ + unsigned long ratio : 3; /* RW */ + unsigned long rsvd_23_63: 41; /* */ + } s; +}; + +/* ========================================================================= */ /* UVH_SI_ADDR_MAP_CONFIG */ /* ========================================================================= */ #define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL -- cgit v1.1 From 746f2eb790e75676ddc3b816ba18bac4179cc744 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov <gorcunov@gmail.com> Date: Tue, 1 Jul 2008 21:43:52 +0400 Subject: x86: apic_32.c - add lapic resource Add lapic resource into kernel resource map and mark it as busy Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com> Cc: "Maciej W. Rozycki" <macro@linux-mips.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> CC: Maciej W. Rozycki <macro@linux-mips.org> --- arch/x86/kernel/apic_32.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6dea830..3e94720 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -82,6 +82,11 @@ int pic_mode; /* Have we found an MP table */ int smp_found_config; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + static unsigned int calibration_result; static int lapic_next_event(unsigned long delta, @@ -1720,3 +1725,21 @@ static int __init apic_set_verbosity(char *str) } __setup("apic=", apic_set_verbosity); +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); -- cgit v1.1 From 0ef95533326a7b37d16025af9edc0c18e644b346 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 1 Jul 2008 11:43:18 -0700 Subject: x86: merge sched_clock handling Move the basic global variable definitions and sched_clock handling in the common "tsc.c" file. - Unify notsc kernel command line handling for 32 bit and 64bit. - Functional changes for 64bit. - "tsc_disabled" is updated if "notsc" is passed at boottime. - Fallback to jiffies for sched_clock, incase notsc is passed on commandline. Signed-off-by: Alok N Kataria <akataria@vmware.com> Signed-off-by: Dan Hecht <dhecht@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/time_32.c | 3 -- arch/x86/kernel/tsc.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/tsc_32.c | 86 ++--------------------------------------------- arch/x86/kernel/tsc_64.c | 67 ++++++------------------------------ 5 files changed, 100 insertions(+), 144 deletions(-) create mode 100644 arch/x86/kernel/tsc.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 54829e2..ca904ee 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-y += tsc_$(BITS).o io_delay.o rtc.o +obj-y += tsc_$(BITS).o io_delay.o rtc.o tsc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5f29f12..059ca6e 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -39,9 +39,6 @@ #include "do_timer.h" -unsigned int cpu_khz; /* Detected as we calibrate the TSC */ -EXPORT_SYMBOL(cpu_khz); - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 0000000..5d0be77 --- /dev/null +++ b/arch/x86/kernel/tsc.c @@ -0,0 +1,86 @@ +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/timer.h> + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); + +/* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +int tsc_unstable; + +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +int tsc_disabled = -1; + +/* + * Scheduler clock - returns current time in nanosec units. + */ +u64 native_sched_clock(void) +{ + u64 this_offset; + + /* + * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achive it. ) + */ + if (unlikely(tsc_disabled)) { + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + } + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + +int check_tsc_unstable(void) +{ + return tsc_unstable; +} +EXPORT_SYMBOL_GPL(check_tsc_unstable); + +#ifdef CONFIG_X86_TSC +int __init notsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC completely.\n"); + tsc_disabled = 1; + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +int __init notsc_setup(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_TSC); + return 1; +} +#endif + +__setup("notsc", notsc_setup); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 6240922..dc89900 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -15,52 +15,8 @@ #include "mach_timer.h" -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ -static int tsc_disabled = -1; - -/* - * On some systems the TSC frequency does not - * change with the cpu frequency. So we need - * an extra value to store the TSC freq - */ -unsigned int tsc_khz; -EXPORT_SYMBOL_GPL(tsc_khz); - -#ifdef CONFIG_X86_TSC -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); - tsc_disabled = 1; - return 1; -} -#else -/* - * disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c - */ -static int __init tsc_setup(char *str) -{ - setup_clear_cpu_cap(X86_FEATURE_TSC); - return 1; -} -#endif - -__setup("notsc", tsc_setup); - -/* - * code to mark and check if the TSC is unstable - * due to cpufreq or due to unsynced TSCs - */ -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); +extern int tsc_unstable; +extern int tsc_disabled; /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -109,44 +65,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long native_sched_clock(void) -{ - unsigned long long this_offset; - - /* - * Fall back to jiffies if there's no TSC available: - * ( But note that we still use it if the TSC is marked - * unstable. We do this because unlike Time Of Day, - * the scheduler clock tolerates small errors and it's - * very important for it to be as fast as the platform - * can achive it. ) - */ - if (unlikely(tsc_disabled)) - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - - /* read the Time Stamp Counter: */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long sched_clock(void) - __attribute__((alias("native_sched_clock"))); -#endif - unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9898fb0..69cbe4c 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -13,12 +13,8 @@ #include <asm/timer.h> #include <asm/vgtod.h> -static int notsc __initdata = 0; - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -unsigned int tsc_khz; -EXPORT_SYMBOL(tsc_khz); +extern int tsc_unstable; +extern int tsc_disabled; /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -41,6 +37,7 @@ EXPORT_SYMBOL(tsc_khz); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ + DEFINE_PER_CPU(unsigned long, cyc2ns); static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) @@ -63,41 +60,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } -unsigned long long native_sched_clock(void) -{ - unsigned long a = 0; - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - * which means it is not completely exact and may not be monotonous - * between CPUs. But the errors should be too small to matter for - * scheduling purposes. - */ - - rdtscll(a); - return cycles_2_ns(a); -} - -/* We need to define a real function for sched_clock, to override the - weak default version */ -#ifdef CONFIG_PARAVIRT -unsigned long long sched_clock(void) -{ - return paravirt_sched_clock(); -} -#else -unsigned long long -sched_clock(void) __attribute__((alias("native_sched_clock"))); -#endif - - -static int tsc_unstable; - -int check_tsc_unstable(void) -{ - return tsc_unstable; -} -EXPORT_SYMBOL_GPL(check_tsc_unstable); - #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -281,14 +243,6 @@ __cpuinit int unsynchronized_tsc(void) return num_present_cpus() > 1; } -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); - static struct clocksource clocksource_tsc; /* @@ -346,12 +300,13 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); void __init init_tsc_clocksource(void) { - if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - if (check_tsc_unstable()) - clocksource_tsc.rating = 0; + if (tsc_disabled > 0) + return; - clocksource_register(&clocksource_tsc); - } + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); + if (check_tsc_unstable()) + clocksource_tsc.rating = 0; + + clocksource_register(&clocksource_tsc); } -- cgit v1.1 From bfc0f5947afa5e3a13e55867f4478c8a92c11dca Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 1 Jul 2008 11:43:24 -0700 Subject: x86: merge tsc calibration Merge the tsc calibration code for the 32bit and 64bit kernel. The paravirtualized calculate_cpu_khz for 64bit now points to the correct tsc_calibrate code as in 32bit. Original native_calculate_cpu_khz for 64 bit is now called as calibrate_cpu. Also moved the recalibrate_cpu_khz function in the common file. Note that this function is called only from powernow K7 cpu freq driver. Signed-off-by: Alok N Kataria <akataria@vmware.com> Signed-off-by: Dan Hecht <dhecht@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/time_64.c | 26 ++++++--- arch/x86/kernel/tsc.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/tsc_32.c | 74 +------------------------- arch/x86/kernel/tsc_64.c | 94 +-------------------------------- include/asm-x86/hpet.h | 2 +- include/asm-x86/tsc.h | 1 - 6 files changed, 154 insertions(+), 174 deletions(-) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 39ae851..c6ac4da 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -unsigned long __init native_calculate_cpu_khz(void) +static unsigned long __init calibrate_cpu(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -114,14 +114,18 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } +extern void set_cyc2ns_scale(unsigned long cpu_khz, int cpu); + void __init time_init(void) { - tsc_calibrate(); + int cpu; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; - cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calculate_cpu_khz(); + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calibrate_cpu(); lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; @@ -134,7 +138,17 @@ void __init time_init(void) vgetcpu_mode = VGETCPU_LSL; printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); + cpu_khz / 1000, cpu_khz % 1000); + + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + init_tsc_clocksource(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5d0be77..e6ee145 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,7 +1,11 @@ +#include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/module.h> #include <linux/timer.h> +#include <linux/acpi_pmtmr.h> + +#include <asm/hpet.h> unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -84,3 +88,130 @@ int __init notsc_setup(char *str) #endif __setup("notsc", notsc_setup); + +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 50000 + +/* + * Read TSC and the reference counters. Take care of SMI disturbance + */ +static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +{ + u64 t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles(); + if (hpet) + *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *pm = acpi_pm_read_early(); + t2 = get_cycles(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULLONG_MAX; +} + +/** + * tsc_calibrate - calibrate the tsc on boot + */ +static unsigned int __init tsc_calibrate(void) +{ + unsigned long flags; + u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; + int hpet = is_hpet_enabled(); + unsigned int tsc_khz_val = 0; + + local_irq_save(flags); + + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + tr1 = get_cycles(); + while ((inb(0x61) & 0x20) == 0); + tr2 = get_cycles(); + + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Preset the result with the raw and inaccurate PIT + * calibration value + */ + delta = (tr2 - tr1); + do_div(delta, 50); + tsc_khz_val = delta; + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) { + printk(KERN_INFO "TSC calibrated against PIT\n"); + goto out; + } + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { + printk(KERN_WARNING "TSC calibration disturbed by SMI, " + "using PIT calibration result\n"); + goto out; + } + + tsc2 = (tsc2 - tsc1) * 1000000LL; + + if (hpet) { + printk(KERN_INFO "TSC calibrated against HPET\n"); + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tsc1, 1000000); + } else { + printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = pm2 * 1000000000LL; + do_div(tsc1, PMTMR_TICKS_PER_SEC); + } + + do_div(tsc2, tsc1); + tsc_khz_val = tsc2; + +out: + return tsc_khz_val; +} + +unsigned long native_calculate_cpu_khz(void) +{ + return tsc_calibrate(); +} + +#ifdef CONFIG_X86_32 +/* Only called from the Powernow K7 cpu freq driver */ +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data(0).loops_per_jiffy = + cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index dc89900..40c0aaf 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -42,7 +42,7 @@ extern int tsc_disabled; DEFINE_PER_CPU(unsigned long, cyc2ns); -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { unsigned long long tsc_now, ns_now; unsigned long flags, *scale; @@ -65,78 +65,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } -unsigned long native_calculate_cpu_khz(void) -{ - unsigned long long start, end; - unsigned long count; - u64 delta64 = (u64)ULLONG_MAX; - int i; - unsigned long flags; - - local_irq_save(flags); - - /* run 3 times to ensure the cache is warm and to get an accurate reading */ - for (i = 0; i < 3; i++) { - mach_prepare_counter(); - rdtscll(start); - mach_countup(&count); - rdtscll(end); - - /* - * Error: ECTCNEVERSET - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - if (count <= 1) - continue; - - /* cpu freq too slow: */ - if ((end - start) <= CALIBRATE_TIME_MSEC) - continue; - - /* - * We want the minimum time of all runs in case one of them - * is inaccurate due to SMI or other delay - */ - delta64 = min(delta64, (end - start)); - } - - /* cpu freq too fast (or every run was bad): */ - if (delta64 > (1ULL<<32)) - goto err; - - delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ - do_div(delta64,CALIBRATE_TIME_MSEC); - - local_irq_restore(flags); - return (unsigned long)delta64; -err: - local_irq_restore(flags); - return 0; -} - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} - -EXPORT_SYMBOL(recalibrate_cpu_khz); - #ifdef CONFIG_CPU_FREQ /* diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 69cbe4c..c852ff9 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -40,7 +40,7 @@ extern int tsc_disabled; DEFINE_PER_CPU(unsigned long, cyc2ns); -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { unsigned long long tsc_now, ns_now; unsigned long flags, *scale; @@ -130,98 +130,6 @@ core_initcall(cpufreq_tsc); #endif -#define MAX_RETRIES 5 -#define SMI_TRESHOLD 50000 - -/* - * Read TSC and the reference counters. Take care of SMI disturbance - */ -static unsigned long __init tsc_read_refs(unsigned long *pm, - unsigned long *hpet) -{ - unsigned long t1, t2; - int i; - - for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles(); - if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; - else - *pm = acpi_pm_read_early(); - t2 = get_cycles(); - if ((t2 - t1) < SMI_TRESHOLD) - return t2; - } - return ULONG_MAX; -} - -/** - * tsc_calibrate - calibrate the tsc on boot - */ -void __init tsc_calibrate(void) -{ - unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(), cpu; - - local_irq_save(flags); - - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); - - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); - - local_irq_restore(flags); - - /* - * Preset the result with the raw and inaccurate PIT - * calibration value - */ - tsc_khz = (tr2 - tr1) / 50; - - /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; - } - - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; - } - - tsc2 = (tsc2 - tsc1) * 1000000L; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000UL; - hpet2 -= hpet1; - tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000; - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = (pm2 * 1000000000) / PMTMR_TICKS_PER_SEC; - } - - tsc_khz = tsc2 / tsc1; - -out: - for_each_possible_cpu(cpu) - set_cyc2ns_scale(tsc_khz, cpu); -} - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h index 6a9b4ac..82f1ac6 100644 --- a/include/asm-x86/hpet.h +++ b/include/asm-x86/hpet.h @@ -86,8 +86,8 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler); #else /* CONFIG_HPET_TIMER */ static inline int hpet_enable(void) { return 0; } -static inline unsigned long hpet_readl(unsigned long a) { return 0; } static inline int is_hpet_enabled(void) { return 0; } +#define hpet_readl(a) 0 #endif #endif /* ASM_X86_HPET_H */ diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h index 548873a..761054d 100644 --- a/include/asm-x86/tsc.h +++ b/include/asm-x86/tsc.h @@ -58,7 +58,6 @@ int check_tsc_unstable(void); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); -extern void tsc_calibrate(void); extern int notsc_setup(char *); #endif -- cgit v1.1 From 2dbe06faf37b39f9ecffc054dd173b2a1dc2adcd Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 1 Jul 2008 11:43:31 -0700 Subject: x86: merge the TSC cpu-freq code Unify the TSC cpufreq code. Signed-off-by: Alok N Kataria <akataria@vmware.com> Signed-off-by: Dan Hecht <dhecht@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/tsc.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/tsc_32.c | 113 ---------------------------------------------- arch/x86/kernel/tsc_64.c | 114 ----------------------------------------------- 3 files changed, 114 insertions(+), 227 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e6ee145..595f78a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> +#include <linux/cpufreq.h> #include <asm/hpet.h> @@ -215,3 +216,116 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); #endif /* CONFIG_X86_32 */ + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); + +void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data(freq->cpu).loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + tsc_khz_ref = tsc_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable("cpufreq changes"); + } + + set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif /* CONFIG_CPU_FREQ */ diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 40c0aaf..bbc153d 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -18,119 +18,6 @@ extern int tsc_unstable; extern int tsc_disabled; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); - -void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - /* - * Start smoothly with the new frequency: - */ - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -#ifdef CONFIG_CPU_FREQ - -/* - * if the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long cpu_khz_ref; - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = data; - - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - return 0; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy; - cpu_khz_ref = cpu_khz; - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data(freq->cpu).loops_per_jiffy = - cpufreq_scale(loops_per_jiffy_ref, - ref_freq, freq->new); - - if (cpu_khz) { - - if (num_online_cpus() == 1) - cpu_khz = cpufreq_scale(cpu_khz_ref, - ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz, freq->cpu); - /* - * TSC based sched_clock turns - * to junk w/ cpufreq - */ - mark_tsc_unstable("cpufreq changes"); - } - } - } - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - return cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); -} -core_initcall(cpufreq_tsc); - -#endif - /* clock source code */ static struct clocksource clocksource_tsc; diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index c852ff9..80a274b 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -16,120 +16,6 @@ extern int tsc_unstable; extern int tsc_disabled; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); - -void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - * changes. - * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. - * - * Should fix up last_tsc too. Currently gettimeofday in the - * first tick after the change will be slightly wrong. - */ - -static unsigned int ref_freq; -static unsigned long loops_per_jiffy_ref; -static unsigned long tsc_khz_ref; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - tsc_khz_ref = tsc_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable("cpufreq changes"); - } - - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. -- cgit v1.1 From 8fbbc4b45ce3e4c0eeb15004c79c72b6896a79c2 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 1 Jul 2008 11:43:34 -0700 Subject: x86: merge tsc_init and clocksource code Unify the clocksource code. Unify the tsc_init code. Signed-off-by: Alok N Kataria <akataria@vmware.com> Signed-off-by: Dan Hecht <dhecht@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/time_64.c | 32 +------ arch/x86/kernel/tsc.c | 212 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/tsc_32.c | 188 ---------------------------------------- arch/x86/kernel/tsc_64.c | 106 ----------------------- include/asm-x86/apic.h | 7 +- include/asm-x86/delay.h | 4 + include/asm-x86/time.h | 2 + include/asm-x86/tsc.h | 1 - 9 files changed, 224 insertions(+), 330 deletions(-) delete mode 100644 arch/x86/kernel/tsc_32.c delete mode 100644 arch/x86/kernel/tsc_64.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ca904ee..59b14c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o -obj-y += tsc_$(BITS).o io_delay.o rtc.o tsc.o +obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index c6ac4da..e3d49c5 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -56,7 +56,7 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) +unsigned long __init calibrate_cpu(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -114,41 +114,13 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } -extern void set_cyc2ns_scale(unsigned long cpu_khz, int cpu); - void __init time_init(void) { - int cpu; - - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - - lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; - - if (unsynchronized_tsc()) - mark_tsc_unstable("TSCs unsynchronized"); - + tsc_init(); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; else vgetcpu_mode = VGETCPU_LSL; - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - for_each_possible_cpu(cpu) - set_cyc2ns_scale(cpu_khz, cpu); - - init_tsc_clocksource(); late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 595f78a..94c16bd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -5,8 +5,16 @@ #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> +#include <linux/dmi.h> +#include <linux/delay.h> +#include <linux/clocksource.h> +#include <linux/percpu.h> #include <asm/hpet.h> +#include <asm/timer.h> +#include <asm/vgtod.h> +#include <asm/time.h> +#include <asm/delay.h> unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -16,12 +24,12 @@ EXPORT_SYMBOL(tsc_khz); /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ -int tsc_unstable; +static int tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent erroneous rdtsc usage on !cpu_has_tsc processors */ -int tsc_disabled = -1; +static int tsc_disabled = -1; /* * Scheduler clock - returns current time in nanosec units. @@ -241,7 +249,7 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); DEFINE_PER_CPU(unsigned long, cyc2ns); -void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { unsigned long long tsc_now, ns_now; unsigned long flags, *scale; @@ -329,3 +337,201 @@ static int __init cpufreq_tsc(void) core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ + +/* clocksource code */ + +static struct clocksource clocksource_tsc; + +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles(); + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)vget_cycles(); + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? + ret : __vsyscall_gtod_data.clock.cycle_last; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, +#ifdef CONFIG_X86_64 + .vread = vread_tsc, +#endif +}; + +void mark_tsc_unstable(char *reason) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_change_rating(&clocksource_tsc, 0); + else + clocksource_tsc.rating = 0; + } +} + +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + tsc_unstable = 1; + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +/* + * Geode_LX - the OLPC CPU has a possibly a very reliable TSC + */ +#ifdef CONFIG_MGEODE_LX +/* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + +static void __init check_geode_tsc_reliable(void) +{ + unsigned long res_low, res_high; + + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + if (res_low & RTSC_SUSP) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} +#else +static inline void check_geode_tsc_reliable(void) { } +#endif + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ + if (!cpu_has_tsc || tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + tsc_unstable = 1; + } + + return tsc_unstable; +} + +static void __init init_tsc_clocksource(void) +{ + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) { + clocksource_tsc.rating = 0; + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } + clocksource_register(&clocksource_tsc); +} + +void __init tsc_init(void) +{ + u64 lpj; + int cpu; + + if (!cpu_has_tsc) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) { + mark_tsc_unstable("could not calculate TSC khz"); + return; + } + +#ifdef CONFIG_X86_64 + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calibrate_cpu(); +#endif + + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_fine = lpj; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + + if (tsc_disabled > 0) + return; + + /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + + use_tsc_delay(); + /* Check and install the TSC clocksource */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) + mark_tsc_unstable("TSCs unsynchronized"); + + check_geode_tsc_reliable(); + init_tsc_clocksource(); +} + diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c deleted file mode 100644 index bbc153d..0000000 --- a/arch/x86/kernel/tsc_32.c +++ /dev/null @@ -1,188 +0,0 @@ -#include <linux/sched.h> -#include <linux/clocksource.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/jiffies.h> -#include <linux/init.h> -#include <linux/dmi.h> -#include <linux/percpu.h> - -#include <asm/delay.h> -#include <asm/tsc.h> -#include <asm/io.h> -#include <asm/timer.h> - -#include "mach_timer.h" - -extern int tsc_unstable; -extern int tsc_disabled; - -/* clock source code */ - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp issue. This can be observed in - * a very small window right after one CPU updated cycle_last under - * xtime lock and the other CPU reads a TSC value which is smaller - * than the cycle_last reference value due to a TSC which is slighty - * behind. This delta is nowhere else observable, but in that case it - * results in a forward time jump in the range of hours due to the - * unsigned delta calculation of the time keeping core code, which is - * necessary to support wrapping clocksources like pm timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret; - - rdtscll(ret); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to: %s.\n", reason); - /* Can be called before registration */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); - tsc_unstable = 1; - return 0; -} - -/* List of systems that have known TSC problems */ -static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { - { - .callback = dmi_mark_tsc_unstable, - .ident = "IBM Thinkpad 380XD", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), - }, - }, - {} -}; - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (!cpu_has_tsc || tsc_unstable) - return 1; - - /* Anything with constant TSC should be synchronized */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* - * Intel systems are normally all synchronized. - * Exceptions must mark TSC as unstable: - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) - tsc_unstable = 1; - } - return tsc_unstable; -} - -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ -#ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ -#define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ - unsigned long res_low, res_high; - - rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } -#endif - - -void __init tsc_init(void) -{ - int cpu; - u64 lpj; - - if (!cpu_has_tsc || tsc_disabled > 0) - return; - - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; - - if (!cpu_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - return; - } - - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - - /* now allow native_sched_clock() to use rdtsc */ - tsc_disabled = 0; - - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); - - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - for_each_possible_cpu(cpu) - set_cyc2ns_scale(cpu_khz, cpu); - - use_tsc_delay(); - - /* Check and install the TSC clocksource */ - dmi_check_system(bad_tsc_dmi_table); - - unsynchronized_tsc(); - check_geode_tsc_reliable(); - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } - clocksource_register(&clocksource_tsc); -} diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c deleted file mode 100644 index 80a274b..0000000 --- a/arch/x86/kernel/tsc_64.c +++ /dev/null @@ -1,106 +0,0 @@ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/clocksource.h> -#include <linux/time.h> -#include <linux/acpi.h> -#include <linux/cpufreq.h> -#include <linux/acpi_pmtmr.h> - -#include <asm/hpet.h> -#include <asm/timex.h> -#include <asm/timer.h> -#include <asm/vgtod.h> - -extern int tsc_unstable; -extern int tsc_disabled; - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -__cpuinit int unsynchronized_tsc(void) -{ - if (tsc_unstable) - return 1; - -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return 0; - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -static struct clocksource clocksource_tsc; - -/* - * We compare the TSC to the cycle_last value in the clocksource - * structure to avoid a nasty time-warp. This can be observed in a - * very small window right after one CPU updated cycle_last under - * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which - * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in - * that case it results in a forward time jump in the range of hours - * due to the unsigned delta calculation of the time keeping core - * code, which is necessary to support wrapping clocksources like pm - * timer. - */ -static cycle_t read_tsc(void) -{ - cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret = (cycle_t)vget_cycles(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; -} - -static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY, - .vread = vread_tsc, -}; - -void mark_tsc_unstable(char *reason) -{ - if (!tsc_unstable) { - tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); - /* Change only the rating, when not registered */ - if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else - clocksource_tsc.rating = 0; - } -} -EXPORT_SYMBOL_GPL(mark_tsc_unstable); - -void __init init_tsc_clocksource(void) -{ - if (tsc_disabled > 0) - return; - - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); - if (check_tsc_unstable()) - clocksource_tsc.rating = 0; - - clocksource_register(&clocksource_tsc); -} diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index a298077..4e2c1e5 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -121,12 +121,17 @@ extern void enable_NMI_through_LVT0(void); */ #ifdef CONFIG_X86_64 extern void early_init_lapic_mapping(void); +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} #endif extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); -extern int apic_is_clustered_box(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } diff --git a/include/asm-x86/delay.h b/include/asm-x86/delay.h index 409a649..bb80880 100644 --- a/include/asm-x86/delay.h +++ b/include/asm-x86/delay.h @@ -26,6 +26,10 @@ extern void __delay(unsigned long loops); ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \ __ndelay(n)) +#ifdef CONFIG_X86_32 void use_tsc_delay(void); +#else +#define use_tsc_delay() {} +#endif #endif /* _ASM_X86_DELAY_H */ diff --git a/include/asm-x86/time.h b/include/asm-x86/time.h index bce72d7..a17fa47 100644 --- a/include/asm-x86/time.h +++ b/include/asm-x86/time.h @@ -56,4 +56,6 @@ static inline int native_set_wallclock(unsigned long nowtime) #endif /* CONFIG_PARAVIRT */ +extern unsigned long __init calibrate_cpu(void); + #endif diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h index 761054d..cb6f6ee 100644 --- a/include/asm-x86/tsc.h +++ b/include/asm-x86/tsc.h @@ -48,7 +48,6 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -extern void init_tsc_clocksource(void); int check_tsc_unstable(void); /* -- cgit v1.1 From e93ef949fd9a3f237aedfb8e64414b28980530b8 Mon Sep 17 00:00:00 2001 From: Alok Kataria <akataria@vmware.com> Date: Tue, 1 Jul 2008 11:43:36 -0700 Subject: x86: rename paravirtualized TSC functions Rename the paravirtualized calculate_cpu_khz to calibrate_tsc. In all cases, we actually calibrate_tsc and use that as the cpu_khz value. Signed-off-by: Alok N Kataria <akataria@vmware.com> Signed-off-by: Dan Hecht <dhecht@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/tsc.c | 18 +++++++----------- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 4 ++-- arch/x86/lguest/boot.c | 4 ++-- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/time.c | 4 ++-- arch/x86/xen/xen-ops.h | 2 +- include/asm-x86/paravirt.h | 4 ++-- include/asm-x86/timer.h | 4 ++-- include/asm-x86/vmi_time.h | 2 +- 11 files changed, 22 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e7e5652..e0f571d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -285,7 +285,7 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, + .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 94c16bd..3c36f92 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -123,9 +123,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) } /** - * tsc_calibrate - calibrate the tsc on boot + * native_calibrate_tsc - calibrate the tsc on boot */ -static unsigned int __init tsc_calibrate(void) +unsigned long native_calibrate_tsc(void) { unsigned long flags; u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; @@ -195,10 +195,6 @@ out: return tsc_khz_val; } -unsigned long native_calculate_cpu_khz(void) -{ - return tsc_calibrate(); -} #ifdef CONFIG_X86_32 /* Only called from the Powernow K7 cpu freq driver */ @@ -208,8 +204,8 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -487,10 +483,10 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; - cpu_khz = calculate_cpu_khz(); - tsc_khz = cpu_khz; + tsc_khz = calibrate_tsc(); + cpu_khz = tsc_khz; - if (!cpu_khz) { + if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); return; } diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 946bf13..b153460 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -932,7 +932,7 @@ static inline int __init activate_vmi(void) pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_cpu_khz = vmi_cpu_khz; + pv_time_ops.get_tsc_khz = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index ba7d19e..6953859 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -69,8 +69,8 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ -unsigned long vmi_cpu_khz(void) +/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +unsigned long vmi_tsc_khz(void) { unsigned long long khz; khz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index e72cf07..50dad44 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -607,7 +607,7 @@ static unsigned long lguest_get_wallclock(void) * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 * TSC clock will give up and not register itself. */ -static unsigned long lguest_cpu_khz(void) +static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } @@ -998,7 +998,7 @@ __init void lguest_init(void) /* time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; - pv_time_ops.get_cpu_khz = lguest_cpu_khz; + pv_time_ops.get_tsc_khz = lguest_tsc_khz; /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b98083..dcd4e51 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1062,7 +1062,7 @@ static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_cpu_khz = xen_cpu_khz, + .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 64f0038..685b774 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -197,8 +197,8 @@ unsigned long long xen_sched_clock(void) } -/* Get the CPU speed from Xen */ -unsigned long xen_cpu_khz(void) +/* Get the TSC speed from Xen */ +unsigned long xen_tsc_khz(void) { u64 xen_khz = 1000000ULL << 32; const struct pvclock_vcpu_time_info *info = diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a05559..d852ddb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,7 +32,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); -unsigned long xen_cpu_khz(void); +unsigned long xen_tsc_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index 6d8966f..ef5e8ec 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -84,7 +84,7 @@ struct pv_time_ops { int (*set_wallclock)(unsigned long); unsigned long long (*sched_clock)(void); - unsigned long (*get_cpu_khz)(void); + unsigned long (*get_tsc_khz)(void); }; struct pv_cpu_ops { @@ -779,7 +779,7 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calculate_cpu_khz() (pv_time_ops.get_cpu_khz()) +#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { diff --git a/include/asm-x86/timer.h b/include/asm-x86/timer.h index 4f6fcb0..fb2a4dd 100644 --- a/include/asm-x86/timer.h +++ b/include/asm-x86/timer.h @@ -7,14 +7,14 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calculate_cpu_khz(void); +unsigned long native_calibrate_tsc(void); extern int timer_ack; extern int no_timer_check; extern int recalibrate_cpu_khz(void); #ifndef CONFIG_PARAVIRT -#define calculate_cpu_khz() native_calculate_cpu_khz() +#define calibrate_tsc() native_calibrate_tsc() #endif /* Accelerators for sched_clock() diff --git a/include/asm-x86/vmi_time.h b/include/asm-x86/vmi_time.h index 4781881..c3118c3 100644 --- a/include/asm-x86/vmi_time.h +++ b/include/asm-x86/vmi_time.h @@ -50,7 +50,7 @@ extern void __init vmi_time_init(void); extern unsigned long vmi_get_wallclock(void); extern int vmi_set_wallclock(unsigned long now); extern unsigned long long vmi_sched_clock(void); -extern unsigned long vmi_cpu_khz(void); +extern unsigned long vmi_tsc_khz(void); #ifdef CONFIG_X86_LOCAL_APIC extern void __devinit vmi_time_bsp_init(void); -- cgit v1.1 From a8c1be9d2e78d8608892c86791837acf12da4bf6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:29:44 +0200 Subject: x86: initial changes to unify traps_32.c and traps_64.c This patch does not change the generated object files. Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 78 ++++++------ arch/x86/kernel/traps_64.c | 309 ++++++++++++++++++++++----------------------- 2 files changed, 191 insertions(+), 196 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index d7cc292..9243969 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1,5 +1,6 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 @@ -130,7 +131,8 @@ void printk_address(unsigned long address, int reliable) #endif } -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size) { return p > (void *)tinfo && p <= (void *)tinfo + THREAD_SIZE - size; @@ -138,14 +140,14 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned s /* The form of the top of the frame on the stack */ struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; + struct stack_frame *next_frame; + unsigned long return_address; }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -167,8 +169,6 @@ print_context_stack(struct thread_info *tinfo, return bp; } -#define MSG(msg) ops->warning(data, msg) - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -178,7 +178,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; - stack = &dummy; if (task != current) stack = (unsigned long *)task->thread.sp; @@ -196,7 +195,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } #endif - while (1) { + for (;;) { struct thread_info *context; context = (struct thread_info *) @@ -248,10 +247,10 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, }; static void @@ -351,15 +350,14 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at EIP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } @@ -546,7 +544,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ { \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -562,7 +560,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ } @@ -571,7 +569,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } @@ -586,22 +584,22 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_addr = (void __user *)siaddr; \ trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + == NOTIFY_STOP) \ return; \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR(3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) @@ -799,7 +797,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) + == NOTIFY_STOP) return; #ifdef CONFIG_X86_LOCAL_APIC /* @@ -818,6 +816,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -915,7 +915,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) tsk->thread.debugctlmsr = 0; if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ if (regs->flags & X86_EFLAGS_IF) @@ -997,7 +997,7 @@ void math_error(void __user *ip) * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't syncronizing its FPU usage + * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception */ @@ -1006,7 +1006,7 @@ void math_error(void __user *ip) switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ return; - default: /* Multiple exceptions */ + default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ /* @@ -1198,16 +1198,16 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ - set_system_gate(4, &overflow); - set_trap_gate(5, &bounds); - set_trap_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); + set_trap_gate(0, ÷_error); + set_intr_gate(1, &debug); + set_intr_gate(2, &nmi); + set_system_intr_gate(3, &int3); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_trap_gate(5, &bounds); + set_trap_gate(6, &invalid_op); + set_trap_gate(7, &device_not_available); + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); + set_trap_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 80ba6d3..686074e 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -205,8 +205,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -#define MSG(txt) ops->warning(data, txt) - /* * x86-64 can have up to three kernel stacks: * process stack @@ -233,11 +231,11 @@ struct stack_frame { unsigned long return_address; }; - -static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -259,7 +257,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, return bp; } -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, +void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { @@ -268,31 +266,29 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; - if (!tsk) - tsk = current; - tinfo = task_thread_info(tsk); + if (!task) + task = current; + tinfo = task_thread_info(task); if (!stack) { unsigned long dummy; stack = &dummy; - if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.sp; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER if (!bp) { - if (tsk == current) { + if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp):); + asm("movq %%rbp, %0" : "=r" (bp) :); } else { /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) tsk->thread.sp; + bp = *(unsigned long *) task->thread.sp; } } #endif - - /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -382,18 +378,17 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, - unsigned long bp) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, - unsigned long bp) +_show_stack(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp) { unsigned long *stack; int i; @@ -405,14 +400,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, // back trace for this cpu. if (sp == NULL) { - if (tsk) - sp = (unsigned long *)tsk->thread.sp; + if (task) + sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; } stack = sp; - for(i=0; i < kstack_depth_to_print; i++) { + for (i = 0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { stack = (unsigned long *) (irqstack_end[-1]); @@ -427,12 +422,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, sp, bp); + show_trace(task, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * sp) +void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(tsk, NULL, sp, 0); + _show_stack(task, NULL, sp, 0); } /* @@ -440,7 +435,7 @@ void show_stack(struct task_struct *tsk, unsigned long * sp) */ void dump_stack(void) { - unsigned long dummy; + unsigned long stack; unsigned long bp = 0; #ifdef CONFIG_FRAME_POINTER @@ -453,7 +448,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy, bp); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -488,7 +483,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at RIP */ - ip = (u8 *) regs->ip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } for (i = 0; i < code_len; i++, ip++) { @@ -504,7 +499,7 @@ void show_registers(struct pt_regs *regs) } } printk("\n"); -} +} int is_valid_bugaddr(unsigned long ip) { @@ -576,8 +571,10 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; + show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ @@ -589,7 +586,7 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err) return 0; } -void die(const char * str, struct pt_regs * regs, long err) +void die(const char * str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); @@ -606,8 +603,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == - NOTIFY_STOP) + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; flags = oops_begin(); @@ -629,9 +625,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGBUS); } -static void __kprobes do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, - siginfo_t *info) +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) { struct task_struct *tsk = current; @@ -676,38 +672,38 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* Runs on IST stack */ @@ -775,14 +771,14 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, } static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs * regs) +mem_parity_error(unsigned char reason, struct pt_regs *regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) - if(edac_handler_set()) { + if (edac_handler_set()) { edac_atomic_assert_error(); return; } @@ -799,7 +795,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) } static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs * regs) +io_check_error(unsigned char reason, struct pt_regs *regs) { printk("NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -836,7 +832,7 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system. */ + /* Only the BSP gets external NMIs from the system. */ if (!cpu) reason = get_nmi_reason(); @@ -848,18 +844,17 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog_tick(regs,reason)) + if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs,cpu)) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + return; /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) mem_parity_error(reason, regs); if (reason & 0x40) @@ -870,9 +865,12 @@ asmlinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); + add_pda(__nmi_count, 1); + if (!ignore_nmis) default_do_nmi(regs); + nmi_exit(); } @@ -889,13 +887,14 @@ void restart_nmi(void) } /* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) +asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) return; - } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); @@ -948,21 +947,19 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { + if (!tsk->thread.debugreg7) goto clear_dr7; - } } tsk->thread.debugreg6 = condition; - /* * Single-stepping through TF: make sure we ignore any events in * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if (!user_mode(regs)) + goto clear_TF_reenable; } /* Ok, finally something we can handle */ @@ -975,7 +972,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - set_debugreg(0UL, 7); + set_debugreg(0, 7); preempt_conditional_cli(regs); return; @@ -983,6 +980,7 @@ clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); + return; } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) @@ -1005,7 +1003,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) asmlinkage void do_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short cwd, swd; @@ -1038,30 +1036,30 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: /* No unmasked exception */ + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } @@ -1074,7 +1072,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { void __user *ip = (void __user *)(regs->ip); - struct task_struct * task; + struct task_struct *task; siginfo_t info; unsigned short mxcsr; @@ -1102,25 +1100,25 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) */ mxcsr = get_fpu_mxcsr(task); switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; } force_sig_info(SIGFPE, &info, task); } @@ -1138,7 +1136,7 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) } /* - * 'math_state_restore()' saves the current math information in the + * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. @@ -1163,7 +1161,7 @@ asmlinkage void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + clts(); /* Allow maths ops (or we recurse) */ restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; @@ -1172,64 +1170,61 @@ EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { - set_intr_gate(0,÷_error); - set_intr_gate_ist(1,&debug,DEBUG_STACK); - set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4,&overflow); /* int4 can be called from all */ - set_intr_gate(5,&bounds); - set_intr_gate(6,&invalid_op); - set_intr_gate(7,&device_not_available); - set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9,&coprocessor_segment_overrun); - set_intr_gate(10,&invalid_TSS); - set_intr_gate(11,&segment_not_present); - set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); - set_intr_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_intr_gate(15,&spurious_interrupt_bug); - set_intr_gate(16,&coprocessor_error); - set_intr_gate(17,&alignment_check); + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ + set_system_gate(4, &overflow); /* int4 can be called from all */ + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); + set_intr_gate(14, &page_fault); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate_ist(18,&machine_check, MCE_STACK); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif - set_intr_gate(19,&simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); #ifdef CONFIG_IA32_EMULATION set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif - /* * initialize the per thread extended state: */ init_thread_xstate(); /* - * Should be a barrier for any external CPU state. + * Should be a barrier for any external CPU state: */ cpu_init(); } - static int __init oops_setup(char *s) -{ +{ if (!s) return -EINVAL; if (!strcmp(s, "panic")) panic_on_oops = 1; return 0; -} +} early_param("oops", oops_setup); static int __init kstack_setup(char *s) { if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s,NULL,0); + kstack_depth_to_print = simple_strtoul(s, NULL, 0); return 0; } early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { code_bytes = simple_strtoul(s, NULL, 0); -- cgit v1.1 From badc76527f7e29302f0bde3d366c59101fb2ab87 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:30:30 +0200 Subject: x86: traps_xx: shuffle headers and globals Reorder headers and collect globals in traps_32.c and traps_64.c Code size and data size are unaffected by the changes. Code itself is changed due to different ordering of data and bss. The bss segment changed size due to a change in the packing of the variables. Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 9 +++---- arch/x86/kernel/traps_64.c | 61 +++++++++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 9243969..5339af4 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -61,8 +61,6 @@ #include "mach_traps.h" -int panic_on_unrecovered_nmi; - DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -99,8 +97,11 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +static int ignore_nmis; +static int die_counter; void printk_address(unsigned long address, int reliable) { @@ -382,8 +383,6 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static int die_counter; - int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -829,8 +828,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); } -static int ignore_nmis; - notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 686074e..03d63b0 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -10,49 +10,49 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'entry.S'. */ -#include <linux/sched.h> +#include <linux/moduleparam.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/kdebug.h> #include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> #include <linux/string.h> +#include <linux/unwind.h> +#include <linux/delay.h> #include <linux/errno.h> -#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/sched.h> #include <linux/timer.h> -#include <linux/mm.h> #include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/nmi.h> -#include <linux/kprobes.h> -#include <linux/kexec.h> -#include <linux/unwind.h> -#include <linux/uaccess.h> #include <linux/bug.h> -#include <linux/kdebug.h> -#include <linux/utsname.h> - -#include <mach_traps.h> +#include <linux/nmi.h> +#include <linux/mm.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <asm/system.h> -#include <asm/io.h> -#include <asm/atomic.h> +#include <asm/stacktrace.h> +#include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/unwind.h> #include <asm/desc.h> #include <asm/i387.h> -#include <asm/processor.h> -#include <asm/unwind.h> +#include <asm/nmi.h> #include <asm/smp.h> +#include <asm/io.h> #include <asm/pgalloc.h> -#include <asm/pda.h> #include <asm/proto.h> -#include <asm/nmi.h> -#include <asm/stacktrace.h> +#include <asm/pda.h> + +#include <mach_traps.h> asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -72,12 +72,14 @@ asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +asmlinkage void machine_check(void); int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; -static unsigned ignore_nmis; +static int ignore_nmis; +static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -101,8 +103,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -int kstack_depth_to_print = 12; - void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS @@ -559,7 +559,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) int __kprobes __die(const char * str, struct pt_regs * regs, long err) { - static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); -- cgit v1.1 From e423f49fc8ccd761618748d7139638d8ddc1d16f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:31:03 +0200 Subject: x86: traps_xx: modify __die if (cond) block -> if (!cond) goto end_of_block Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 30 +++++++++++++----------------- arch/x86/kernel/traps_64.c | 4 ++-- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5339af4..5afd94a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -399,26 +399,22 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; - return 0; + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; } - - return 1; + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; } /* diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 03d63b0..019a06f 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -557,9 +557,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) do_exit(signr); } -int __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif -- cgit v1.1 From a7bbb0ce1d1f956c8491b925c74767adf654f373 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:31:34 +0200 Subject: x86: traps_xx: modify do_trap if (cond) block -> if (!cond) goto end_of_block Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_64.c | 56 +++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 019a06f..1c2d533 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -630,38 +630,38 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; - if (user_mode(regs)) { - /* - * We want error_code and trap_no set for userspace - * faults and kernelspace faults which result in - * die(), but not kernelspace faults which are fixed - * up. die() gives the process no chance to handle - * the signal and notice the kernel fault information, - * so that won't result in polluting the information - * about previously queued, but not yet delivered, - * faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (!user_mode(regs)) + goto kernel_trap; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); } + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; +kernel_trap: if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; -- cgit v1.1 From 13485ab55bc77f02024e80bcca0374950b0becb3 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:32:04 +0200 Subject: x86: traps_xx: restructure do_general_protection() - if (cond) block -> if (!cond) goto end_of_block - local caching of current Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 36 ++++++++++++++++++++---------------- arch/x86/kernel/traps_64.c | 41 ++++++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5afd94a..c9a0120 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -598,8 +598,10 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) -void __kprobes do_general_protection(struct pt_regs *regs, long error_code) +void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { + struct task_struct *tsk; struct thread_struct *thread; struct tss_struct *tss; int cpu; @@ -640,23 +642,24 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) goto gp_in_vm86; + tsk = current; if (!user_mode(regs)) goto gp_in_kernel; - current->thread.error_code = error_code; - current->thread.trap_no = 13; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; - if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - current->comm, task_pid_nr(current), - regs->ip, regs->sp, error_code); + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); } - force_sig(SIGSEGV, current); + force_sig(SIGSEGV, tsk); return; gp_in_vm86: @@ -665,14 +668,15 @@ gp_in_vm86: return; gp_in_kernel: - if (!fixup_exception(regs)) { - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + return; + die("general protection fault", regs, error_code); } static notrace __kprobes void diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 1c2d533..bc21ddc 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -733,31 +733,34 @@ asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) die(str, regs, error_code); } -asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, - long error_code) +asmlinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; + struct task_struct *tsk; conditional_sti(regs); - if (user_mode(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; - force_sig(SIGSEGV, tsk); - return; - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; +gp_in_kernel: if (fixup_exception(regs)) return; -- cgit v1.1 From abd348072798aa88d48fe9f182ac3440fcb7ae47 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 18:39:01 +0200 Subject: x86: traps_xx: modify default_do_nmi - local caching of smp_processor_id() in default_do_nmi() - v2: do not split default_do_nmi over two lines On Wed, Jul 02, 2008 at 08:12:20PM +0400, Cyrill Gorcunov wrote: > | -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) > | +static notrace __kprobes void > | +default_do_nmi(struct pt_regs *regs) > | [ ... ] > | -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) > | +asmlinkage notrace __kprobes void > | +default_do_nmi(struct pt_regs *regs) > > Hi Alexander, good done, thanks! But why did you split default_do_nmi > definition by two lines? I think it would be better to keep them as it > was before, ie by a single line > > static notrace __kprobes void default_do_nmi(struct pt_regs *regs) Thanks! Here is the replacement patch with default_do_nmi left on a single line. Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 9 ++++++--- arch/x86/kernel/traps_64.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index c9a0120..fb8e3cc 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -789,9 +789,12 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); - /* Only the BSP gets external NMIs from the system: */ - if (!smp_processor_id()) + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) reason = get_nmi_reason(); if (!(reason & 0xc0)) { @@ -805,7 +808,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) */ if (nmi_watchdog_tick(regs, reason)) return; - if (!do_nmi_callback(regs, smp_processor_id())) + if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index bc21ddc..9b9245f 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -827,7 +827,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; -- cgit v1.1 From 7b4fd4bb2e7fad2c41afab4683a44ab77f6f35d0 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum <heukelum@mailshack.com> Date: Wed, 2 Jul 2008 01:33:14 +0200 Subject: x86: traps_xx: various small changes - order of local variable declarations - minor code changes Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm> Acked-by: Cyrill Gorcunov <gorcunov@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/traps_32.c | 15 +++++++-------- arch/x86/kernel/traps_64.c | 20 +++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index fb8e3cc..8a76897 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -106,13 +106,13 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS - char namebuf[KSYM_NAME_LEN]; unsigned long offset = 0; unsigned long symsize; const char *symname; - char reliab[4] = ""; - char *delim = ":"; char *modname; + char *delim = ":"; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -135,8 +135,8 @@ void printk_address(unsigned long address, int reliable) static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned int size) { - return p > (void *)tinfo && - p <= (void *)tinfo + THREAD_SIZE - size; + void *t = tinfo; + return p > t && p <= t + THREAD_SIZE - size; } /* The form of the top of the frame on the stack */ @@ -976,9 +976,8 @@ clear_TF_reenable: void math_error(void __user *ip) { struct task_struct *task; - unsigned short cwd; - unsigned short swd; siginfo_t info; + unsigned short cwd, swd; /* * Save the info for the exception handler and clear the error. @@ -1042,8 +1041,8 @@ void do_coprocessor_error(struct pt_regs *regs, long error_code) static void simd_math_error(void __user *ip) { struct task_struct *task; - unsigned short mxcsr; siginfo_t info; + unsigned short mxcsr; /* * Save the info for the exception handler and clear the error. diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 9b9245f..74e9929 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -268,7 +268,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!task) task = current; - tinfo = task_thread_info(task); if (!stack) { unsigned long dummy; @@ -294,6 +293,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * current stack address. If the stacks consist of nested * exceptions */ + tinfo = task_thread_info(task); for (;;) { char *id; unsigned long *estack_end; @@ -435,8 +435,8 @@ void show_stack(struct task_struct *task, unsigned long *sp) */ void dump_stack(void) { - unsigned long stack; unsigned long bp = 0; + unsigned long stack; #ifdef CONFIG_FRAME_POINTER if (!bp) @@ -459,12 +459,8 @@ void show_registers(struct pt_regs *regs) unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - u8 *ip; - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; sp = regs->sp; - ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -475,12 +471,18 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; + u8 *ip; + printk("Stack: "); _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); printk("\n"); printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { /* try starting at RIP */ ip = (u8 *)regs->ip; @@ -585,7 +587,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 0; } -void die(const char * str, struct pt_regs *regs, long err) +void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); @@ -927,8 +929,8 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) asmlinkage void __kprobes do_debug(struct pt_regs * regs, unsigned long error_code) { - unsigned long condition; struct task_struct *tsk = current; + unsigned long condition; siginfo_t info; trace_hardirqs_fixup(); @@ -1201,7 +1203,7 @@ void __init trap_init(void) /* * initialize the per thread extended state: */ - init_thread_xstate(); + init_thread_xstate(); /* * Should be a barrier for any external CPU state: */ -- cgit v1.1 From 6f585e01614f38195599c1bc5379d3c9dae6be47 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Fri, 4 Jul 2008 12:36:21 +0200 Subject: arch/x86/kernel/smpboot.c: fix warning arch/x86/kernel/smpboot.c: In function 'do_boot_cpu': arch/x86/kernel/smpboot.c:943: warning: label 'restore_state' defined but not used Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e1200b2..f35c2d8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -939,9 +939,9 @@ do_rest: inquire_remote_apic(apicid); } } - +#ifdef CONFIG_X86_64 restore_state: - +#endif if (boot_error) { /* Try to put things back the way they were before ... */ numa_remove_cpu(cpu); /* was set by numa_add_cpu */ -- cgit v1.1 From 5ed4273af8469ca4723d4bf1bcd3abe2cc792e9f Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Fri, 4 Jul 2008 12:36:21 +0200 Subject: arch/x86/mm/pgtable_32.c: remove unused variable `fixmaps' arch/x86/mm/pgtable_32.c:144: warning: 'fixmaps' defined but not used Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/pgtable_32.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 828907d..b4becbf 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -141,7 +141,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) __flush_tlb_one(vaddr); } -static int fixmaps; unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -- cgit v1.1 From e407dffd17dcb592e1605a2b3dbbb81f9f3cbc21 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 9 Jul 2008 08:00:15 +0200 Subject: x86, uv: build fix for "x86, uv: update x86 mmr list for SGI uv" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: In file included from arch/x86/kernel/genx2apic_uv_x.c:25: include/asm/uv/uv_mmrs.h:986: error: redefinition of ‘union uvh_rh_gam_cfg_overlay_config_mmr_u’ include/asm/uv/uv_mmrs.h:988: error: redefinition of ‘struct uvh_rh_gam_cfg_overlay_config_mmr_s’ include/asm/uv/uv_mmrs.h:1064: error: redefinition of ‘union uvh_rh_gam_mmioh_overlay_config_mmr_u’ include/asm/uv/uv_mmrs.h:1066: error: redefinition of ‘struct uvh_rh_gam_mmioh_overlay_config_mmr_s’ caused by duplicate section (cut & paste error) in commit 5d061e397db1 "x86, uv: update x86 mmr list for SGI uv". Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/uv/uv_mmrs.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index eb59201..a8a1ad5 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -974,26 +974,6 @@ union uvh_rh_gam_cfg_overlay_config_mmr_u { }; /* ========================================================================= */ -/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */ -/* ========================================================================= */ -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL - -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -union uvh_rh_gam_cfg_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_cfg_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long rsvd_46_62: 17; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL -- cgit v1.1 From 26e9e57b106445bbd8c965985e4e8af5293ae005 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 9 Jul 2008 08:02:54 +0200 Subject: x86, uv: build fix #2 for "x86, uv: update x86 mmr list for SGI uv" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: In file included from arch/x86/kernel/tlb_uv.c:14: include/asm/uv/uv_mmrs.h:986: error: redefinition of ‘union uvh_rh_gam_cfg_overlay_config_mmr_u’ include/asm/uv/uv_mmrs.h:988: error: redefinition of ‘struct uvh_rh_gam_cfg_overlay_config_mmr_s’ include/asm/uv/uv_mmrs.h:1064: error: redefinition of ‘union uvh_rh_gam_mmioh_overlay_config_mmr_u’ include/asm/uv/uv_mmrs.h:1066: error: redefinition of ‘struct uvh_rh_gam_mmioh_overlay_config_mmr_s’ caused by another duplicate section (cut & paste error) in commit 5d061e397db1 "x86, uv: update x86 mmr list for SGI uv". Signed-off-by: Ingo Molnar <mingo@elte.hu> --- include/asm-x86/uv/uv_mmrs.h | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/include/asm-x86/uv/uv_mmrs.h b/include/asm-x86/uv/uv_mmrs.h index a8a1ad5..151fd7f 100644 --- a/include/asm-x86/uv/uv_mmrs.h +++ b/include/asm-x86/uv/uv_mmrs.h @@ -1028,32 +1028,6 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { }; /* ========================================================================= */ -/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ -/* ========================================================================= */ -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL - -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -union uvh_rh_gam_mmioh_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_29: 30; /* */ - unsigned long base : 16; /* RW */ - unsigned long m_io : 6; /* RW */ - unsigned long n_io : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL -- cgit v1.1 From b50efd2a55fc1344654875369d458bb6838bd37a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 8 Jul 2008 01:41:05 -0700 Subject: x86: introduce page_size_mask for 64bit prepare for overmapped patch also printout last_map_addr together with end Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 98 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 57d5eff..7227a0a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -340,7 +340,8 @@ phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) } static unsigned long __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, + unsigned long page_size_mask) { unsigned long pages = 0; @@ -365,7 +366,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) continue; } - if (cpu_has_pse) { + if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; set_pte((pte_t *)pmd, pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -383,20 +384,22 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) } static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, + unsigned long page_size_mask) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, address, end); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); return last_map_addr; } static unsigned long __meminit -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, + unsigned long page_size_mask) { unsigned long pages = 0; unsigned long last_map_addr = end; @@ -418,11 +421,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) if (pud_val(*pud)) { if (!pud_large(*pud)) - last_map_addr = phys_pmd_update(pud, addr, end); + last_map_addr = phys_pmd_update(pud, addr, end, + page_size_mask); continue; } - if (direct_gbpages) { + if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -433,7 +437,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pmd_init(pmd, addr, end); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); unmap_low_page(pmd); pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); @@ -446,13 +450,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) } static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end) +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long page_size_mask) { pud_t *pud; pud = (pud_t *)pgd_page_vaddr(*pgd); - return phys_pud_init(pud, addr, end); + return phys_pud_init(pud, addr, end, page_size_mask); } static void __init find_early_table_space(unsigned long end) @@ -608,29 +613,12 @@ static void __init early_memtest(unsigned long start, unsigned long end) } #endif -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) +static unsigned long __init kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { - unsigned long next, last_map_addr = end; - unsigned long start_phys = start, end_phys = end; - printk(KERN_INFO "init_memory_mapping\n"); - - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) { - init_gbpages(); - find_early_table_space(end); - } + unsigned long next, last_map_addr = end; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -645,7 +633,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon next = end; if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end)); + last_map_addr = phys_pud_update(pgd, __pa(start), + __pa(end), page_size_mask); continue; } @@ -654,22 +643,61 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon else pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + page_size_mask); unmap_low_page(pud); pgd_populate(&init_mm, pgd_offset_k(start), __va(pud_phys)); } + return last_map_addr; +} +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + unsigned long last_map_addr; + unsigned long page_size_mask = 0; + + printk(KERN_INFO "init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ + if (!after_bootmem) { + init_gbpages(); + find_early_table_space(end); + } + + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; + + last_map_addr = kernel_physical_mapping_init(start, end, + page_size_mask); + if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); - if (!after_bootmem) + if (!after_bootmem && table_end > table_start) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + printk(KERN_INFO "last_map_addr: %lx end: %lx\n", + last_map_addr, end); + if (!after_bootmem) - early_memtest(start_phys, end_phys); + early_memtest(start, end); return last_map_addr >> PAGE_SHIFT; } -- cgit v1.1 From 49c980df552499e5e8595b52448f612fdab0484a Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Thu, 3 Jul 2008 12:29:34 -0700 Subject: x86: fix vmemmap printout check Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: "Nick Piggin" <npiggin@suse.de> Cc: "Mark McLoughlin" <markmc@redhat.com> Cc: xen-devel <xen-devel@lists.xensource.com> Cc: "Eduardo Habkost" <ehabkost@redhat.com> Cc: "Vegard Nossum" <vegard.nossum@gmail.com> Cc: "Stephen Tweedie" <sct@redhat.com> Cc: "Jeremy Fitzhardinge" <jeremy@goop.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7227a0a..f7d8031 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1104,9 +1104,6 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; - /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) @@ -1116,6 +1113,9 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) node_start = node; p_start = p; } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else vmemmap_verify((pte_t *)pmd, node, addr, next); } -- cgit v1.1 From 2dc807b37b7b8c7df445513ad2b415df4ebcaf6d Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 8 Jul 2008 18:56:38 -0700 Subject: x86: make max_pfn cover acpi table below 4g When system have 4g less ram installed, and acpi table sit near end of ram, make max_pfn cover them too, so 64bit kernel don't need to mess up fixmap. Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Cc: "Suresh Siddha" <suresh.b.siddha@intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/e820.c | 18 ++++++++++++------ arch/x86/kernel/setup.c | 13 +++---------- include/asm-x86/e820.h | 2 +- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e07d401..2e08619 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1056,12 +1056,20 @@ unsigned long __initdata end_user_pfn = MAX_ARCH_PFN; /* * Find the highest page frame number we have available */ -unsigned long __init e820_end_of_ram(void) +unsigned long __init e820_end(void) { - unsigned long last_pfn; + int i; + unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - last_pfn = find_max_pfn_with_active_regions(); + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long end_pfn; + + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; @@ -1192,9 +1200,7 @@ static int __init parse_memmap_opt(char *p) * the real mem size before original memory map is * reset. */ - e820_register_active_regions(0, 0, -1UL); - saved_max_pfn = e820_end_of_ram(); - remove_all_active_ranges(); + saved_max_pfn = e820_end(); #endif e820.nr_map = 0; userdef = 1; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bea8ae7..a7c3471 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -709,22 +709,18 @@ void __init setup_arch(char **cmdline_p) early_gart_iommu_check(); #endif - e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram(); + max_pfn = e820_end(); /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); - if (mtrr_trim_uncached_memory(max_pfn)) { - remove_all_active_ranges(); - e820_register_active_regions(0, 0, -1UL); - max_pfn = e820_end_of_ram(); - } + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820_end(); #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ @@ -767,9 +763,6 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); - /* Remove active ranges so rediscovery with NUMA-awareness happens */ - remove_all_active_ranges(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index a20d0a7f..78c03d7 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -99,7 +99,7 @@ extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); -extern unsigned long e820_end_of_ram(void); +extern unsigned long e820_end(void); extern int e820_find_active_region(const struct e820entry *ei, unsigned long start_pfn, unsigned long last_pfn, -- cgit v1.1 From c2e6d65bcea2672788f9bb58ce7606c41388387b Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yhlu.kernel@gmail.com> Date: Tue, 8 Jul 2008 01:43:27 -0700 Subject: x86: not overmap more than the end of RAM in init_memory_mapping - 64bit handle head and tail that are not aligned to big pages (2MB/1GB boundary). with this patch, on system that support gbpages, change: last_map_addr: 1080000000 end: 1078000000 to: last_map_addr: 1078000000 end: 1078000000 Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/mm/init_64.c | 77 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f7d8031..51f69b3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -462,18 +462,25 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, static void __init find_early_table_space(unsigned long end) { - unsigned long puds, tables, start; + unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (!direct_gbpages) { - unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - } - if (!cpu_has_pse) { - unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); - } + if (direct_gbpages) { + unsigned long extra; + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (cpu_has_pse) { + unsigned long extra; + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could @@ -660,8 +667,9 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long last_map_addr; + unsigned long last_map_addr = end; unsigned long page_size_mask = 0; + unsigned long start_pfn, end_pfn; printk(KERN_INFO "init_memory_mapping\n"); @@ -682,8 +690,53 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (cpu_has_pse) page_size_mask |= 1 << PG_LEVEL_2M; - last_map_addr = kernel_physical_mapping_init(start, end, - page_size_mask); + /* head if not big page aligment ?*/ + start_pfn = start >> PAGE_SHIFT; + end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) + last_map_addr = kernel_physical_mapping_init( + start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT, 0); + + /* big page (2M) range*/ + start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)); + if (start_pfn < end_pfn) + last_map_addr = kernel_physical_mapping_init( + start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT, + page_size_mask & (1<<PG_LEVEL_2M)); + + /* big page (1G) range */ + start_pfn = end_pfn; + end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) + last_map_addr = kernel_physical_mapping_init( + start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT, + page_size_mask & ((1<<PG_LEVEL_2M) + | (1<<PG_LEVEL_1G))); + + /* tail is not big page (1G) alignment */ + start_pfn = end_pfn; + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) + last_map_addr = kernel_physical_mapping_init( + start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT, + page_size_mask & (1<<PG_LEVEL_2M)); + /* tail is not big page (2M) alignment */ + start_pfn = end_pfn; + end_pfn = end>>PAGE_SHIFT; + if (start_pfn < end_pfn) + last_map_addr = kernel_physical_mapping_init( + start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT, 0); if (!after_bootmem) mmu_cr4_features = read_cr4(); -- cgit v1.1 From e2079c43861f71b2deb78ee20e247ad954fdd67e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rjw@sisk.pl> Date: Tue, 8 Jul 2008 16:12:26 +0200 Subject: x86: fix C1E && nx6325 stability problem The problems are that, with the ACPI vs timer overring issue _fixed_, after using the box for some time (between several seconds and 1 hour, at random) processes get very high CPU loads (once I've got X using 107% of the CPU, for example) and the system becomes unresponsive, as though there were interrupts lost or something similar. Andreas Herrman reproduced similar problems: > Ok, now I've reproduced the stability problem. > - Using tip/master, > - reverting e38502eb8aa82314d5ab0eba45f50e6790dadd88 and > - applying your patch from this posting > http://marc.info/?l=linux-kernel&m=121539354224562&w=4 > > Starting X, firefox, gimp, tuxpaint and doing some drawing in tuxpaint > results in a slow system. Drawing is almost not possible anymore -- > Selections of new colors, cursors etc. is performed with huge delay > if it's performed at all. > > BTW, the code sets up timer IRQ as Virtual Wire IRQ: > > Jul 8 14:57:58 kodscha IO-APIC (apicid-pin) 2-22, 2-23 not connected. > Jul 8 14:57:58 kodscha ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 > Jul 8 14:57:58 kodscha ...trying to set up timer as Virtual Wire IRQ... works. > > and both INT0 and INT2 of IOAPIC are masked: > > Jul 8 14:57:58 kodscha NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect: > Jul 8 14:57:58 kodscha 00 000 1 0 0 0 0 0 0 00 > Jul 8 14:57:58 kodscha 01 003 0 0 0 0 0 1 1 31 > Jul 8 14:57:58 kodscha 02 003 1 0 0 0 0 0 0 30 > > I've also seen strange CPU utilization -- with syslog-ng: > > top - 15:33:06 up 35 min, 4 users, load average: 1.70, 0.68, 0.37 > Tasks: 64 total, 4 running, 60 sleeping, 0 stopped, 0 zombie > Cpu0 : 0.0%us,100.0%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st > Cpu1 : 6.4%us, 87.2%sy, 0.0%ni, 5.8%id, 0.0%wa, 0.6%hi, 0.0%si, 0.0%st > Mem: 895384k total, 283568k used, 611816k free, 35492k buffers > Swap: 1959920k total, 0k used, 1959920k free, 163044k cached > > PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND > 4632 root 20 0 17216 800 580 S 104 0.1 0:34.22 syslog-ng > 28505 root 20 0 205m 11m 4024 S 6 1.3 0:21.16 X > 28518 root 20 0 56292 5652 4492 S 1 0.6 0:01.80 fluxbox > 1 root 20 0 3724 608 508 S 0 0.1 0:00.36 init > > So far I have no clue why C1E-idle in conjunction with virtual wire > mode causes this strange behaviour. > > ... and I start to think about the root cause of all this. > > I've performed similar tests under X with the IRQ0/INT0 configuration and > I did not see above symptoms. So lets fall back to the IRQ0/INT0 configuration on this box. This basically restores the dont-use-the-lapic-timer exception mechanism that was unconditional on this box prior commit 8750bf5 ("x86: add C1E aware idle function"). Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 43 +++++++++++++++++++++++++++++++++++++------ arch/x86/kernel/io_apic_32.c | 10 ++++++++++ arch/x86/kernel/io_apic_64.c | 10 ++++++++++ include/asm-x86/genapic_32.h | 1 + include/asm-x86/genapic_64.h | 2 ++ 5 files changed, 60 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5c01076..e1f0139 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1373,8 +1373,6 @@ static void __init acpi_process_madt(void) return; } -#ifdef __i386__ - static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { @@ -1436,6 +1434,17 @@ dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d) } /* + * Force ignoring BIOS IRQ0 pin2 override + */ +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) +{ + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); + acpi_skip_timer_override = 1; + force_mask_ioapic_irq_2(); + return 0; +} + +/* * If your system is blacklisted here, but you find that acpi=force * works for you, please contact acpi-devel@sourceforge.net */ @@ -1628,11 +1637,35 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), }, }, + /* + * HP laptops which use a DSDT reporting as HP/SB400/10000, + * which includes some code which overrides all temperature + * trip points to 16C if the INTIN2 input of the I/O APIC + * is enabled. This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 + * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * override in that cases. + */ + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6125 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6325 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, {} }; -#endif /* __i386__ */ - /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. @@ -1660,9 +1693,7 @@ int __init acpi_boot_table_init(void) { int error; -#ifdef __i386__ dmi_check_system(acpi_dmi_table); -#endif /* * If acpi_disabled, bail out diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 337ec34..6b220b9 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -59,6 +59,13 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); +static bool mask_ioapic_irq_2 __initdata; + +void __init force_mask_ioapic_irq_2(void) +{ + mask_ioapic_irq_2 = true; +} + int timer_through_8259 __initdata; /* @@ -2172,6 +2179,9 @@ static inline void __init check_timer(void) printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); + if (mask_ioapic_irq_2) + mask_IO_APIC_irq(2); + /* * Some BIOS writers are clueless and report the ExtINTA * I/O APIC input from the cascaded 8259A as the timer diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 2b4c40b..0494cdb 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -94,6 +94,13 @@ static int no_timer_check; static int disable_timer_pin_1 __initdata; +static bool mask_ioapic_irq_2 __initdata; + +void __init force_mask_ioapic_irq_2(void) +{ + mask_ioapic_irq_2 = true; +} + int timer_through_8259 __initdata; /* Where if anywhere is the i8259 connect in external int mode */ @@ -1698,6 +1705,9 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", cfg->vector, apic1, pin1, apic2, pin2); + if (mask_ioapic_irq_2) + mask_IO_APIC_irq(2); + /* * Some BIOS writers are clueless and report the ExtINTA * I/O APIC input from the cascaded 8259A as the timer diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h index b02ea6e..8d4c8bd 100644 --- a/include/asm-x86/genapic_32.h +++ b/include/asm-x86/genapic_32.h @@ -119,5 +119,6 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define is_uv_system() 0 #define uv_wakeup_secondary(a, b) 1 +extern void force_mask_ioapic_irq_2(void); #endif diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h index 0f85046..082ad02 100644 --- a/include/asm-x86/genapic_64.h +++ b/include/asm-x86/genapic_64.h @@ -46,4 +46,6 @@ extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip); extern void setup_apic_routing(void); +extern void force_mask_ioapic_irq_2(void); + #endif -- cgit v1.1 From 183fe065652dbd64953afa9f389327e23e97967f Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Wed, 9 Jul 2008 11:32:10 +0200 Subject: x86: build fix for "x86: fix C1E && nx6325 stability problem" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/acpi/boot.c: In function ‘dmi_ignore_irq0_timer_override’: arch/x86/kernel/acpi/boot.c:1443: error: implicit declaration of function ‘force_mask_ioapic_irq_2’ Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/acpi/boot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e1f0139..bf7b4f7 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -37,6 +37,7 @@ #include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> +#include <asm/genapic.h> #include <asm/io.h> #include <asm/mpspec.h> #include <asm/smp.h> -- cgit v1.1