author     Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-05-05 14:55:20 -0700
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-05-05 14:55:20 -0700
commit     ea62ccd00fd0b6720b033adfc9984f31130ce195 (patch)
tree       9837b797b2466fffcb0af96c388b06eae9c3df18 /arch/i386/kernel
parent     886a0768affe9a32f18c45f8e1393bca9ece5392 (diff)
parent     35060b6a9a4e1c89bc6fbea61090e302dbc61847 (diff)
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (231 commits)
[PATCH] i386: Don't delete cpu_devs data to identify different x86 types in late_initcall
[PATCH] i386: type may be unused
[PATCH] i386: Some additional chipset register values validation.
[PATCH] i386: Add missing !X86_PAE dependincy to the 2G/2G split.
[PATCH] x86-64: Don't exclude asm-offsets.c in Documentation/dontdiff
[PATCH] i386: avoid redundant preempt_disable in __unlazy_fpu
[PATCH] i386: white space fixes in i387.h
[PATCH] i386: Drop noisy e820 debugging printks
[PATCH] x86-64: Fix allnoconfig error in genapic_flat.c
[PATCH] x86-64: Shut up warnings for vfat compat ioctls on other file systems
[PATCH] x86-64: Share identical video.S between i386 and x86-64
[PATCH] x86-64: Remove CONFIG_REORDER
[PATCH] x86-64: Print type and size correctly for unknown compat ioctls
[PATCH] i386: Remove copy_*_user BUG_ONs for (size < 0)
[PATCH] i386: Little cleanups in smpboot.c
[PATCH] x86-64: Don't enable NUMA for a single node in K8 NUMA scanning
[PATCH] x86: Use RDTSCP for synchronous get_cycles if possible
[PATCH] i386: Add X86_FEATURE_RDTSCP
[PATCH] i386: Implement X86_FEATURE_SYNC_RDTSC on i386
[PATCH] i386: Implement alternative_io for i386
...
Fix up trivial conflict in include/linux/highmem.h manually.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
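For readers who want to reproduce or inspect this merge locally, the following is a minimal sketch of the usual workflow. Only standard git commands are used; the remote name "andi" is an assumption made for the example, not something recorded in the commit.

    # Sketch only: fetch the branch and redo the merge (remote name "andi" is assumed)
    git remote add andi git://one.firstfloor.org/home/andi/git/linux-2.6
    git fetch andi
    git merge andi/for-linus            # stops on the conflict in include/linux/highmem.h
    $EDITOR include/linux/highmem.h     # resolve the trivial conflict by hand, as noted above
    git add include/linux/highmem.h
    git commit                          # record the merge with both parents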
Diffstat (limited to 'arch/i386/kernel')
54 files changed, 2406 insertions, 2696 deletions
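The per-file breakdown from the web view is not reproduced here. As a sketch, assuming a checkout that contains commit ea62ccd0, the path-limited diffstat and patch can be regenerated against the first parent, which is what this view appears to show:

    git diff --stat 886a0768..ea62ccd0 -- arch/i386/kernel   # diffstat against the first parent
    git diff 886a0768..ea62ccd0 -- arch/i386/kernel          # should correspond to the patch text below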
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 4ae3dcf..4f98516 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_VMI) += vmi.o vmitime.o +obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += pcspeaker.o -EXTRA_AFLAGS := -traditional - obj-$(CONFIG_SCx200) += scx200.o # vsyscall.o contains the vsyscall DSO images as __initdata. diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 9ea5b8e..280898b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -874,7 +874,7 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - clustered_apic_check(); + setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 8f7efd3..23f78ef 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,7 +10,6 @@ #include <asm/pci-direct.h> #include <asm/acpi.h> #include <asm/apic.h> -#include <asm/irq.h> #ifdef CONFIG_ACPI @@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device) return 0; } -static void check_intel(void) -{ - u16 vendor, device; - - vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); - - if (vendor != PCI_VENDOR_ID_INTEL) - return; - - device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - void __init check_acpi_pci(void) { int num, slot, func; @@ -77,8 +58,6 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; - check_intel(); - /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 426f59b..e5cec66 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -5,6 +5,7 @@ #include <asm/alternative.h> #include <asm/sections.h> +static int noreplace_smp = 0; static int smp_alt_once = 0; static int debug_alternative = 0; @@ -13,15 +14,33 @@ static int __init bootonly(char *str) smp_alt_once = 1; return 1; } +__setup("smp-alt-boot", bootonly); + static int __init debug_alt(char *str) { debug_alternative = 1; return 1; } - -__setup("smp-alt-boot", bootonly); __setup("debug-alternative", debug_alt); +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#ifdef CONFIG_PARAVIRT +static int noreplace_paravirt = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); +#endif + #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) @@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; -extern u8 __smp_alt_begin[], __smp_alt_end[]; - /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with self modifying code. 
This implies that assymetric systems where @@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #ifdef CONFIG_SMP -static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end); - for (a = start; a < end; a++) { - memcpy(a->replacement + a->replacementlen, - a->instr, - a->instrlen); - } -} - -static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) { - memcpy(a->instr, - a->replacement + a->replacementlen, - a->instrlen); - } -} - static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; @@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end { u8 **ptr; + if (noreplace_smp) + return; + for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name, struct smp_alt_module *smp; unsigned long flags; + if (noreplace_smp) + return; + if (smp_alt_once) { if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(locks, locks_end, @@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod) struct smp_alt_module *item; unsigned long flags; - if (smp_alt_once) + if (smp_alt_once || noreplace_smp) return; spin_lock_irqsave(&smp_alt, flags); @@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp) return; #endif - if (smp_alt_once) + if (noreplace_smp || smp_alt_once) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - alternatives_smp_apply(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { - struct paravirt_patch *p; + struct paravirt_patch_site *p; + + if (noreplace_paravirt) + return; for (p = start; p < end; p++) { unsigned int used; used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, p->len); -#ifdef CONFIG_DEBUG_PARAVIRT - { - int i; - /* Deliberately clobber regs using "not %reg" to find bugs. 
*/ - for (i = 0; i < 3; i++) { - if (p->len - used >= 2 && (p->clobbers & (1 << i))) { - memcpy(p->instr + used, "\xf7\xd0", 2); - p->instr[used+1] |= i; - used += 2; - } - } - } -#endif + + BUG_ON(used > p->len); + /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - /* Sync to be conservative, in case we patched following instructions */ + /* Sync to be conservative, in case we patched following + * instructions */ sync_core(); } -extern struct paravirt_patch __start_parainstructions[], +extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ @@ -396,23 +386,19 @@ void __init alternative_instructions(void) printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); - apply_alternatives(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } free_init_pages("SMP alternatives", - (unsigned long)__smp_alt_begin, - (unsigned long)__smp_alt_end); + __pa_symbol(&__smp_locks), + __pa_symbol(&__smp_locks_end)); } else { - alternatives_smp_save(__smp_alt_instructions, - __smp_alt_instructions_end); alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); alternatives_smp_switch(0); } #endif - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 93aa911..aca054c 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -129,6 +129,28 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned long safe_apic_wait_icr_idle(void) +{ + unsigned long send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 064bbf2..367ff1d 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -233,11 +233,10 @@ #include <asm/desc.h> #include <asm/i8253.h> #include <asm/paravirt.h> +#include <asm/reboot.h> #include "io_ports.h" -extern void machine_real_restart(unsigned char *, int); - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif @@ -384,13 +383,6 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -#ifdef CONFIG_APM_RTC_IS_GMT -# define clock_cmos_diff 0 -# define got_clock_diff 1 -#else -static long clock_cmos_diff; -static int got_clock_diff; -#endif static int debug __read_mostly; static int smp __read_mostly; static int apm_disabled = -1; diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index c375351..27a776c 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -11,11 +11,11 @@ #include <linux/suspend.h> #include <asm/ucontext.h> #include "sigframe.h" +#include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/elf.h> -#include <asm/pda.h> #define DEFINE(sym, 
val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -25,6 +25,9 @@ #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)); +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + void foo(void) { OFFSET(SIGCONTEXT_eax, sigcontext, eax); @@ -90,17 +93,18 @@ void foo(void) OFFSET(pbe_next, pbe, next); /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - + DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VDSO_PRELINK, VDSO_PRELINK); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); - BLANK(); - OFFSET(PDA_cpu, i386_pda, cpu_number); - OFFSET(PDA_pcurrent, i386_pda, pcurrent); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); #ifdef CONFIG_PARAVIRT BLANK(); diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 010aecf..74f27a4 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -2,7 +2,7 @@ # Makefile for x86-compatible CPU details and quirks # -obj-y := common.o proc.o +obj-y := common.o proc.o bugs.o obj-y += amd.o obj-y += cyrix.o @@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 2d47db4..4fec702 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +int force_mwait __cpuinitdata; + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); + + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) @@ -314,13 +319,3 @@ int __init amd_init_cpu(void) cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; return 0; } - -//early_arch_initcall(amd_init_cpu); - -static int __init amd_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_AMD] = NULL; - return 0; -} - -late_initcall(amd_exit_cpu); diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c new file mode 100644 index 0000000..54428a2 --- /dev/null +++ b/arch/i386/kernel/cpu/bugs.c @@ -0,0 +1,191 @@ +/* + * arch/i386/cpu/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * <rreilova@ececs.uc.edu> + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). 
+ */ +#include <linux/init.h> +#include <linux/utsname.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/paravirt.h> +#include <asm/alternative.h> + +static int __init no_halt(char *s) +{ + boot_cpu_data.hlt_works_ok = 0; + return 1; +} + +__setup("no-hlt", no_halt); + +static int __init mca_pentium(char *s) +{ + mca_pentium_flag = 1; + return 1; +} + +__setup("mca-pentium", mca_pentium); + +static int __init no_387(char *s) +{ + boot_cpu_data.hard_math = 0; + write_cr0(0xE | read_cr0()); + return 1; +} + +__setup("no387", no_387); + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + if (!boot_cpu_data.hard_math) { +#ifndef CONFIG_MATH_EMULATION + printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); + printk(KERN_EMERG "Giving up.\n"); + for (;;) ; +#endif + return; + } + +/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */ + /* Test for the divl bug.. */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&boot_cpu_data.fdiv_bug) + : "m" (*&x), "m" (*&y)); + if (boot_cpu_data.fdiv_bug) + printk("Hmm, FPU with FDIV bug.\n"); +} + +static void __init check_hlt(void) +{ + if (paravirt_enabled()) + return; + + printk(KERN_INFO "Checking 'hlt' instruction... "); + if (!boot_cpu_data.hlt_works_ok) { + printk("disabled\n"); + return; + } + halt(); + halt(); + halt(); + halt(); + printk("OK.\n"); +} + +/* + * Most 386 processors have a bug where a POPAD can lock the + * machine even from user space. + */ + +static void __init check_popad(void) +{ +#ifndef CONFIG_X86_POPAD_OK + int res, inp = (int) &res; + + printk(KERN_INFO "Checking for popad bug... "); + __asm__ __volatile__( + "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " + : "=&a" (res) + : "d" (inp) + : "ecx", "edi" ); + /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ + if (res != 12345678) printk( "Buggy.\n" ); + else printk( "OK.\n" ); +#endif +} + +/* + * Check whether we are able to run this kernel safely on SMP. + * + * - In order to run on a i386, we need to be compiled for i386 + * (for due to lack of "invlpg" and working WP on a i386) + * - In order to run on anything without a TSC, we need to be + * compiled for a i486. + * - In order to support the local APIC on a buggy Pentium machine, + * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, + * which happens implicitly if compiled for a Pentium or lower + * (unless an advanced selection of CPU features is used) as an + * otherwise config implies a properly working local APIC without + * the need to do extra reads from the APIC. +*/ + +static void __init check_config(void) +{ +/* + * We'd better not be a i386 if we're configured to use some + * i486+ only features! 
(WP works in supervisor mode and the + * new "invlpg" and "bswap" instructions) + */ +#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP) + if (boot_cpu_data.x86 == 3) + panic("Kernel requires i486+ for 'invlpg' and other features"); +#endif + +/* + * If we configured ourselves for a TSC, we'd better have one! + */ +#ifdef CONFIG_X86_TSC + if (!cpu_has_tsc && !tsc_disable) + panic("Kernel compiled for Pentium+, requires TSC feature!"); +#endif + +/* + * If we were told we had a good local APIC, check for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL + && cpu_has_apic + && boot_cpu_data.x86 == 5 + && boot_cpu_data.x86_model == 2 + && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) + panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); +#endif +} + + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#ifndef CONFIG_SMP + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + check_config(); + check_fpu(); + check_hlt(); + check_popad(); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + alternative_instructions(); +} diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 8c25047..473eac8 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -469,13 +469,3 @@ int __init centaur_init_cpu(void) cpu_devs[X86_VENDOR_CENTAUR] = ¢aur_cpu_dev; return 0; } - -//early_arch_initcall(centaur_init_cpu); - -static int __init centaur_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CENTAUR] = NULL; - return 0; -} - -late_initcall(centaur_exit_cpu); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index dcbbd0a..794d593 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,15 +18,37 @@ #include <asm/apic.h> #include <mach_apic.h> #endif -#include <asm/pda.h> #include "cpu.h" -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. 
+ */ + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(_cpu_pda); + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_init(c); +} - if (c == &boot_cpu_data) - sysenter_setup(); +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + sysenter_setup(); enable_sep_cpu(); + mtrr_bp_init(); +} - if (c == &boot_cpu_data) - mtrr_bp_init(); - else - mtrr_ap_init(); +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + enable_sep_cpu(); + mtrr_ap_init(); } #ifdef CONFIG_X86_HT @@ -601,129 +630,36 @@ void __init early_cpu_init(void) #endif } -/* Make sure %gs is initialized properly in idle threads */ +/* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PDA; + regs->xfs = __KERNEL_PERCPU; return regs; } -static __cpuinit int alloc_gdt(int cpu) +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) { - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - /* - * This is a horrible hack to allocate the GDT. The problem - * is that cpu_init() is called really early for the boot CPU - * (and hence needs bootmem) but much later for the secondary - * CPUs, when bootmem will have gone away - */ - if (NODE_DATA(0)->bdata->node_bootmem_map) { - BUG_ON(gdt != NULL || pda != NULL); - - gdt = alloc_bootmem_pages(PAGE_SIZE); - pda = alloc_bootmem(sizeof(*pda)); - /* alloc_bootmem(_pages) panics on failure, so no check */ - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ - if (gdt == NULL) - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - - if (pda == NULL) - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); - - if (unlikely(!gdt || !pda)) { - free_pages((unsigned long)gdt, 0); - kfree(pda); - return 0; - } - } - - cpu_gdt_descr->address = (unsigned long)gdt; - cpu_pda(cpu) = pda; - - return 1; -} + struct Xgt_desc_struct gdt_descr; -/* Initial PDA used by boot CPU */ -struct i386_pda boot_pda = { - ._pda = &boot_pda, - .cpu_number = 0, - .pcurrent = &init_task, -}; - -static inline void set_kernel_fs(void) -{ - /* Set %fs for this CPU's PDA. Memory clobber is to create a - barrier with respect to any PDA operations, so the compiler - doesn't move any before here. 
*/ - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } -/* Initialize the CPU's GDT and PDA. The boot CPU does this for - itself, but secondaries find this done for them. */ -__cpuinit int init_gdt(int cpu, struct task_struct *idle) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt; - struct i386_pda *pda; - - /* For non-boot CPUs, the GDT and PDA should already have been - allocated. */ - if (!alloc_gdt(cpu)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); - return 0; - } - - gdt = (struct desc_struct *)cpu_gdt_descr->address; - pda = cpu_pda(cpu); - - BUG_ON(gdt == NULL || pda == NULL); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->size = GDT_SIZE - 1; - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; - - return 1; -} - -void __cpuinit cpu_set_gdt(int cpu) -{ - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - - /* Reinit these anyway, even if they've already been done (on - the boot CPU, this will transition from the boot gdt+pda to - the real ones). */ - load_gdt(cpu_gdt_descr); - set_kernel_fs(); -} - -/* Common CPU init for both boot and secondary CPUs */ -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) { + int cpu = smp_processor_id(); + struct task_struct *curr = current; struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; @@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) } load_idt(&idt_descr); + switch_to_new_gdt(); /* * Set up and load the per-CPU TSS and LDT @@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) mxcsr_feature_mask_init(); } -/* Entrypoint to initialize secondary CPU */ -void __cpuinit secondary_cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - _cpu_init(cpu, curr); -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - - /* Set up the real GDT and PDA, so we can transition from the - boot versions. */ - if (!init_gdt(cpu, curr)) { - /* failed to allocate something; not much we can do... 
*/ - for (;;) - local_irq_enable(); - } - - cpu_set_gdt(cpu); - _cpu_init(cpu, curr); -} - #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index de27bd0..0b8411a 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if (vendor == PCI_VENDOR_ID_CYRIX && (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) - pit_latch_buggy = 1; + mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ @@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void) return 0; } -//early_arch_initcall(cyrix_init_cpu); - -static int __init cyrix_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_CYRIX] = NULL; - return 0; -} - -late_initcall(cyrix_exit_cpu); - static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, @@ -470,12 +460,3 @@ int __init nsc_init_cpu(void) return 0; } -//early_arch_initcall(nsc_init_cpu); - -static int __init nsc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NSC] = NULL; - return 0; -} - -late_initcall(nsc_exit_cpu); diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 56fe265..dc4e081 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - if (c->x86 == 15) + if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); + set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); + } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); if ((c->x86 == 0xf && c->x86_model >= 0x03) || diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index b0862af..f9fa414 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) machine_check_vector = k7_machine_check; wmb(); + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + printk (KERN_INFO "Intel machine check architecture supported.\n"); rdmsr (MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */ @@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c) nr_mce_banks = l & 0xff; /* Clear status for MC index 0 separately, we don't touch CTL, - * as some Athlons cause spurious MCEs when its enabled. */ - wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); - for (i=1; i<nr_mce_banks; i++) { + * as some K7 Athlons cause spurious MCEs when its enabled. 
*/ + if (boot_cpu_data.x86 == 6) { + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + i = 1; + } else + i = 0; + for (; i<nr_mce_banks; i++) { wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); } diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 4f10c62..56cd485 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_AMD: - if (c->x86==6 || c->x86==15) - amd_mcheck_init(c); + amd_mcheck_init(c); break; case X86_VENDOR_INTEL: diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a..1509edf 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - if (mce_num_extended_msrs == 0) - goto done; - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); @@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_ESP, r->esp, h); rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; } static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) @@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - struct intel_mce_extended_msrs dbg; rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ @@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - if (intel_get_extended_msrs(&dbg)) { + if (mce_num_extended_msrs > 0) { + struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", smp_processor_id(), dbg.eip, dbg.eflags); printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f77fc53..5367e32 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -20,13 +20,25 @@ struct mtrr_state { mtrr_type def_type; }; +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + {} +}; + static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." 
-static __initdata int mtrr_show; +static int mtrr_show; module_param_named(show, mtrr_show, bool, 0); /* Get the MSR pair relating to a var range */ @@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -static void __init +static void get_fixed_ranges(mtrr_type * frs) { unsigned int *p = (unsigned int *) frs; @@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } -static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) +void mtrr_save_fixed_ranges(void *info) +{ + get_fixed_ranges(mtrr_state.fixed_ranges); +} + +static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types) { unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) - printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); + printk(KERN_INFO "MTRR %05X-%05X %s\n", + base, base + step - 1, mtrr_attrib_to_str(*types)); } /* Grab all of the MTRR state for this CPU into *state */ @@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } +/** + * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs + * see AMD publication no. 24593, chapter 3.2.1 for more information + */ +static inline void k8_enable_fixed_iorrs(void) +{ + unsigned lo, hi; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_K8_SYSCFG, lo + | K8_MTRRFIXRANGE_DRAM_ENABLE + | K8_MTRRFIXRANGE_DRAM_MODIFY, hi); +} + +/** + * Checks and updates an fixed-range MTRR if it differs from the value it + * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also. + * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information + * \param msr MSR address of the MTTR which should be checked and updated + * \param changed pointer which indicates whether the MTRR needed to be changed + * \param msrwords pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 15 && + ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) + k8_enable_fixed_iorrs(); + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = TRUE; + } +} + int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. <base> The starting (base) address of the region. 
@@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, *type = base_lo & 0xff; } +/** + * Checks and updates the fixed-range MTRRs if they differ from the saved set + * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ static int set_fixed_ranges(mtrr_type * frs) { - unsigned int *p = (unsigned int *) frs; + unsigned long long *saved = (unsigned long long *) frs; int changed = FALSE; - int i; - unsigned int lo, hi; + int block=-1, range; - rdmsr(MTRRfix64K_00000_MSR, lo, hi); - if (p[0] != lo || p[1] != hi) { - mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); - changed = TRUE; - } + while (fixed_range_blocks[++block].ranges) + for (range=0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *) saved++); - for (i = 0; i < 2; i++) { - rdmsr(MTRRfix16K_80000_MSR + i, lo, hi); - if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], - p[3 + i * 2]); - changed = TRUE; - } - } - - for (i = 0; i < 8; i++) { - rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi); - if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { - mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], - p[7 + i * 2]); - changed = TRUE; - } - } return changed; } @@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base + size < 0x100) { + if (base < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 0acfb6a..02a2f39 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -729,6 +729,17 @@ void mtrr_ap_init(void) local_irq_restore(flags); } +/** + * Save current fixed-range MTRR state of the BSP + */ +void mtrr_save_state(void) +{ + if (smp_processor_id() == 0) + mtrr_save_fixed_ranges(NULL); + else + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index 8bf23cc..961fbe1 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void) cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; return 0; } - -//early_arch_initcall(nexgen_init_cpu); - -static int __init nexgen_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_NEXGEN] = NULL; - return 0; -} - -late_initcall(nexgen_exit_cpu); diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c new file mode 100644 index 0000000..2b04c8f --- /dev/null +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -0,0 +1,658 @@ +/* local apic based NMI watchdog for various CPUs. + This file also handles reservation of performance counters for coordination + with other users (like oprofile). + + Note that these events normally don't tick when the CPU idles. This means + the frequency varies with CPU load. 
+ + Original code for K7/P6 written by Keith Owens */ + +#include <linux/percpu.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/apic.h> +#include <asm/intel_arch_perfmon.h> + +struct nmi_watchdog_ctlblk { + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; + +/* Interface defining a CPU specific perfctr watchdog */ +struct wd_ops { + int (*reserve)(void); + void (*unreserve)(void); + int (*setup)(unsigned nmi_hz); + void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); + void (*stop)(void *); + unsigned perfctr; + unsigned evntsel; + u64 checkbit; +}; + +static struct wd_ops *wd_ops; + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); +static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); + +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + return wd_ops ? msr - wd_ops->perfctr : 0; +} + +/* converts an msr to an appropriate reservation bit */ +/* returns the bit offset of the event selection register */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + return wd_ops ? 
msr - wd_ops->evntsel : 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, perfctr_nmi_owner)); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, perfctr_nmi_owner)) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, perfctr_nmi_owner); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, evntsel_nmi_owner)) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, evntsel_nmi_owner); +} + +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); + +void disable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) + return; + + on_each_cpu(wd_ops->stop, NULL, 0, 1); + wd_ops->unreserve(); + + BUG_ON(atomic_read(&nmi_active) != 0); +} + +void enable_lapic_nmi_watchdog(void) +{ + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (!wd_ops) + return; + if (!wd_ops->reserve()) { + printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); + return; + } + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); +} + +/* + * Activate the NMI watchdog via the local APIC. + */ + +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + +static void +write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsrl(perfctr_msr, 0 - count); +} + +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr, unsigned nmi_hz) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + +/* AMD K7/K8/Family10h/Family11h support. 
AMD keeps this interface + nicely stable so there is not much variety */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +static int setup_k7_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void single_msr_stop_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int single_msr_reserve(void) +{ + if (!reserve_perfctr_nmi(wd_ops->perfctr)) + return 0; + + if (!reserve_evntsel_nmi(wd_ops->evntsel)) { + release_perfctr_nmi(wd_ops->perfctr); + return 0; + } + return 1; +} + +static void single_msr_unreserve(void) +{ + release_evntsel_nmi(wd_ops->perfctr); + release_perfctr_nmi(wd_ops->evntsel); +} + +static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops k7_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_k7_watchdog, + .rearm = single_msr_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_K7_PERFCTR0, + .evntsel = MSR_K7_EVNTSEL0, + .checkbit = 1ULL<<63, +}; + +/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +static int setup_p6_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + perfctr_msr = MSR_P6_PERFCTR0; + evntsel_msr = MSR_P6_EVNTSEL0; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = P6_EVNTSEL_INT + | P6_EVNTSEL_OS + | P6_EVNTSEL_USR + | P6_NMI_EVENT; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + return 1; +} + +static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + /* P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant. 
+ * ArchPerfom/Core Duo also needs this */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); +} + +static struct wd_ops p6_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_p6_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_P6_PERFCTR0, + .evntsel = MSR_P6_EVNTSEL0, + .checkbit = 1ULL<<39, +}; + +/* Intel P4 performance counters. By far the most complicated of all. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) + +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ + +static int setup_p4_watchdog(unsigned nmi_hz) +{ + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; + unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + +#ifdef CONFIG_SMP + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else +#endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } + + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + return 1; +} + +static void stop_p4_watchdog(void *arg) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); +} + +static int p4_reserve(void) +{ + if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) + return 0; +#ifdef CONFIG_SMP + if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) + goto fail1; +#endif + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail2; + /* RED-PEN why is ESCR1 not reserved here? 
*/ + return 1; + fail2: +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); + fail1: +#endif + release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); + return 0; +} + +static void p4_unreserve(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings > 1) + release_evntsel_nmi(MSR_P4_IQ_PERFCTR1); +#endif + release_evntsel_nmi(MSR_P4_IQ_PERFCTR0); + release_perfctr_nmi(MSR_P4_CRU_ESCR0); +} + +static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +{ + unsigned dummy; + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); +} + +static struct wd_ops p4_wd_ops = { + .reserve = p4_reserve, + .unreserve = p4_unreserve, + .setup = setup_p4_watchdog, + .rearm = p4_rearm, + .stop = stop_p4_watchdog, + /* RED-PEN this is wrong for the other sibling */ + .perfctr = MSR_P4_BPU_PERFCTR0, + .evntsel = MSR_P4_BSU_ESCR0, + .checkbit = 1ULL<<39, +}; + +/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully + all future Intel CPUs. */ + +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(unsigned nmi_hz) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. 
+ */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return 0; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1); + return 1; +} + +static struct wd_ops intel_arch_wd_ops = { + .reserve = single_msr_reserve, + .unreserve = single_msr_unreserve, + .setup = setup_intel_arch_watchdog, + .rearm = p6_rearm, + .stop = single_msr_stop_watchdog, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, +}; + +static void probe_nmi_watchdog(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) + return; + wd_ops = &k7_wd_ops; + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + wd_ops = &intel_arch_wd_ops; + break; + } + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + wd_ops = &p6_wd_ops; + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + return; + + wd_ops = &p4_wd_ops; + break; + default: + return; + } + break; + } +} + +/* Interface to nmi.c */ + +int lapic_watchdog_init(unsigned nmi_hz) +{ + if (!wd_ops) { + probe_nmi_watchdog(); + if (!wd_ops) + return -1; + } + + if (!(wd_ops->setup(nmi_hz))) { + printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", + raw_smp_processor_id()); + return -1; + } + + return 0; +} + +void lapic_watchdog_stop(void) +{ + if (wd_ops) + wd_ops->stop(NULL); +} + +unsigned lapic_adjust_nmi_hz(unsigned hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) + hz = adjust_for_32bit_ctr(hz); + return hz; +} + +int lapic_wd_event(unsigned nmi_hz) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 ctr; + rdmsrl(wd->perfctr_msr, ctr); + if (ctr & wd_ops->checkbit) { /* perfctr still running? 
*/ + return 0; + } + wd_ops->rearm(wd, nmi_hz); + return 1; +} + +int lapic_watchdog_ok(void) +{ + return wd_ops != NULL; +} diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 47e3ebb..89d91e6 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, - NULL, /* constant_tsc - moved to flags */ + "", /* constant_tsc - moved to flags */ /* nothing */ }; struct cpuinfo_x86 *c = v; diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index 9317f74..50076f2 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -50,12 +50,3 @@ int __init rise_init_cpu(void) return 0; } -//early_arch_initcall(rise_init_cpu); - -static int __init rise_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = NULL; - return 0; -} - -late_initcall(rise_exit_cpu); diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 5678d46..6471a5a 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void) cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; return 0; } - -//early_arch_initcall(transmeta_init_cpu); - -static int __init transmeta_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_TRANSMETA] = NULL; - return 0; -} - -late_initcall(transmeta_exit_cpu); diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 1bf3f87..a7a4e75 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -24,13 +24,3 @@ int __init umc_init_cpu(void) cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; return 0; } - -//early_arch_initcall(umc_init_cpu); - -static int __init umc_exit_cpu(void) -{ - cpu_devs[X86_VENDOR_UMC] = NULL; - return 0; -} - -late_initcall(umc_exit_cpu); diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index b4d14c2..265c559 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -33,7 +33,7 @@ static void doublefault_fn(void) printk("double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct tss_struct *t = (struct tss_struct *)tss; + struct i386_hw_tss *t = (struct i386_hw_tss *)tss; printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp); @@ -49,18 +49,21 @@ static void doublefault_fn(void) } struct tss_struct doublefault_tss __cacheline_aligned = { - .esp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + .x86_tss = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, - .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ - .esp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, + .eip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .eflags = X86_EFLAGS_SF | 0x2, + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __pa(swapper_pg_dir) + } }; diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 70f3956..9645bb51 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { { static int __init romsignature(const unsigned char *rom) { + const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address((const 
unsigned short *)rom, sig) == 0 && - sig == ROMSIGNATURE; + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; } -static int __init romchecksum(unsigned char *rom, unsigned long length) +static int __init romchecksum(const unsigned char *rom, unsigned long length) { - unsigned char sum; + unsigned char sum, c; - for (sum = 0; length; length--) - sum += *rom++; - return sum == 0; + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; } static void __init probe_roms(void) { + const unsigned char *rom; unsigned long start, length, upper; - unsigned char *rom; - int i; + unsigned char c; + int i; /* video rom */ upper = adapter_rom_resources[0].start; @@ -191,8 +192,11 @@ static void __init probe_roms(void) video_rom_resource.start = start; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* if checksum okay, trust length byte */ if (length && romchecksum(rom, length)) @@ -226,8 +230,11 @@ static void __init probe_roms(void) if (!romsignature(rom)) continue; + if (probe_kernel_address(rom + 2, c) != 0) + continue; + /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; + length = c * 512; /* but accept any length that fits if checksum okay */ if (!length || start + length > upper || !romchecksum(rom, length)) @@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) ____________________33__ ______________________4_ */ - printk("sanitize start\n"); /* if there's only one memory region, don't bother */ if (*pnr_map < 2) { - printk("sanitize bail 0\n"); return -1; } @@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) /* bail out if we find any unreasonable addresses in bios map */ for (i=0; i<old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { - printk("sanitize bail 1\n"); return -1; } @@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); *pnr_map = new_nr; - printk("sanitize end\n"); return 0; } @@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) unsigned long long size = biosmap->size; unsigned long long end = start + size; unsigned long type = biosmap->type; - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) @@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) * Not right. Fix it up. */ if (type == E820_RAM) { - printk("copy_e820_map() type is E820_RAM\n"); if (start < 0x100000ULL && end > 0xA0000ULL) { - printk("copy_e820_map() lies in range...\n"); - if (start < 0xA0000ULL) { - printk("copy_e820_map() start < 0xA0000ULL\n"); + if (start < 0xA0000ULL) add_memory_region(start, 0xA0000ULL-start, type); - } - if (end <= 0x100000ULL) { - printk("copy_e820_map() end <= 0x100000ULL\n"); + if (end <= 0x100000ULL) continue; - } start = 0x100000ULL; size = end - start; } @@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size) print_memory_map("limit_regions endfunc"); } +/* + * This function checks if any part of the range <start,end> is mapped + * with type. 
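An illustrative aside, not part of the patch: the e820_any_mapped() helper introduced just below returns 1 as soon as any BIOS e820 entry of the requested type overlaps the half-open range [start, end), and treats type == 0 as a wildcard that matches every entry. A hypothetical caller (the E820_RAM constant is real; the VGA-hole range is only an example) might look like:

	/* does the BIOS claim any RAM inside the legacy VGA hole? */
	if (e820_any_mapped(0xA0000ULL, 0x100000ULL, E820_RAM))
		printk(KERN_INFO "e820: RAM reported inside the VGA hole\n");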
+ */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + /* * This function checks if the entire range <start,end> is mapped with type. * diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 8f9c624..dd9e7fa 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct *cpu_gdt_descr; + struct Xgt_desc_struct gdt_descr; spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); - cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - /* * If I don't have PSE, I should just duplicate two entries in page * directory. If I have PSE, I just need to duplicate one entry in @@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) */ local_flush_tlb(); - cpu_gdt_descr->address = __pa(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); } static void efi_call_phys_epilog(void) __releases(efi_rt_lock) { unsigned long cr4; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); + struct Xgt_desc_struct gdt_descr; - cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); - load_gdt(cpu_gdt_descr); + gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 18bddcb..b1f16ee 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -15,7 +15,7 @@ * I changed all the .align's to 4 (16 byte alignment), as that's faster * on a 486. * - * Stack layout in 'ret_from_system_call': + * Stack layout in 'syscall_exit': * ptrace needs to have all regs on the stack. * if the order here is changed, it needs to be * updated in fork.c:copy_process, signal.c:do_signal, @@ -132,7 +132,7 @@ VM_MASK = 0x00020000 movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %fs #define RESTORE_INT_REGS \ @@ -305,16 +305,12 @@ sysenter_past_esp: pushl $(__USER_CS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET cs, 0*/ -#ifndef CONFIG_COMPAT_VDSO /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. 
*/ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -#else - pushl $SYSENTER_RETURN -#endif CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eip, 0 @@ -342,7 +338,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -560,9 +556,7 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt_descr, %ebx); \ - movl GDS_address(%ebx), %ebx; \ + PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ @@ -635,7 +629,7 @@ ENTRY(name) \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ - call smp_/**/name; \ + call smp_##name; \ jmp ret_from_intr; \ CFI_ENDPROC; \ ENDPROC(name) @@ -643,11 +637,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -/* This alternate entry is needed because we hijack the apic LVTT */ -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -#endif - KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault @@ -686,7 +675,7 @@ error_code: pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 3fa7f93..9b10af6 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -34,17 +34,32 @@ /* * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. We need one bit for - * each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) + * and including _end* we need mapped initially. + * We need: + * - one bit for each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * - enough space to map all low memory, which means + * (2^32/4096) / 1024 pages (worst case, non PAE) + * (2^32/4096) / 512 + 4 pages (worst case for PAE) + * - a few pages for allocator use before the kernel pagetable has + * been set up * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. */ -#define INIT_MAP_BEYOND_END (128*1024) +LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +#if PTRS_PER_PMD > 1 +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#else +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#endif +BOOTBITMAP_SIZE = LOW_PAGES / 8 +ALLOCATOR_SLOP = 4 + +INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, @@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but - * we know the trampoline has already loaded the boot_gdt_table GDT - * for us. + * we know the trampoline has already loaded the boot_gdt for us. 
* * If cpu hotplug is not supported then this code can go in init section * which will be freed later @@ -318,12 +332,12 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - call setup_pda lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. + movl %eax,%fs # gets reset once there's real percpu movl $(__USER_DS),%eax # DS/ES contains default USER segment movl %eax,%ds @@ -333,16 +347,17 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax - movl $(__KERNEL_PDA),%eax - mov %eax,%fs - cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP movb ready, %cl movb $1, ready cmpb $0,%cl # the first CPU calls start_kernel - jne initialize_secondary # all other CPUs call initialize_secondary + je 1f + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + jmp initialize_secondary # all other CPUs call initialize_secondary +1: #endif /* CONFIG_SMP */ jmp start_kernel @@ -366,23 +381,6 @@ check_x87: ret /* - * Point the GDT at this CPU's PDA. On boot this will be - * cpu_gdt_table and boot_pda; for secondary CPUs, these will be - * that CPU's GDT and PDA. - */ -ENTRY(setup_pda) - /* get the PDA pointer */ - movl start_pda, %eax - - /* slot the PDA address into the GDT */ - mov early_gdt_descr+2, %ecx - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ - shr $16, %eax - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ - ret - -/* * setup_idt * * sets up a idt with 256 entries pointing to @@ -554,9 +552,6 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data -ENTRY(start_pda) - .long boot_pda - ENTRY(stack_start) .long init_thread_union+THREAD_SIZE .long __BOOT_DS @@ -588,7 +583,7 @@ fault_msg: .word 0 # 32 bit align gdt_desc.address boot_gdt_descr: .word __BOOT_DS+7 - .long boot_gdt_table - __PAGE_OFFSET + .long boot_gdt - __PAGE_OFFSET .word 0 # 32-bit align idt_desc.address idt_descr: @@ -599,67 +594,14 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long cpu_gdt_table + .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ /* - * The boot_gdt_table must mirror the equivalent in setup.S and is + * The boot_gdt must mirror the equivalent in setup.S and is * used only for booting. */ .align L1_CACHE_BYTES -ENTRY(boot_gdt_table) +ENTRY(boot_gdt) .fill GDT_ENTRY_BOOT_CS,8,0 .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ - -/* - * The Global Descriptor Table contains 28 quadwords, per-CPU. 
- */ - .align L1_CACHE_BYTES -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* - * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - .quad 0x00409a000000ffff /* 0x90 32-bit code */ - .quad 0x00009a000000ffff /* 0x98 16-bit code */ - .quad 0x000092000000ffff /* 0xa0 16-bit data */ - .quad 0x0000920000000000 /* 0xa8 16-bit data */ - .quad 0x0000920000000000 /* 0xb0 16-bit data */ - - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. - */ - .quad 0x00409a000000ffff /* 0xb8 APM CS code */ - .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x004092000000ffff /* 0xc8 APM DS data */ - - .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x00cf92000000ffff /* 0xd8 - PDA */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 4afe26e..e3d4b73 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed); #endif EXPORT_SYMBOL(csum_partial); - -EXPORT_SYMBOL(_proxy_pda); diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 89d85d2..1b623cd 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -35,6 +35,7 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> +#include <linux/kthread.h> #include <asm/io.h> #include <asm/smp.h> @@ -661,8 +662,6 @@ static int balanced_irq(void *unused) unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; - daemonize("kirqd"); - /* push everything to CPU 0 to give us a starting point. 
*/ for (i = 0 ; i < NR_IRQS ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); @@ -722,10 +721,9 @@ static int __init balanced_irq_init(void) } printk(KERN_INFO "Starting balanced_irq\n"); - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) return 0; - else - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for_each_possible_cpu(i) { kfree(irq_cpu_data[i].irq_delta); @@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -static inline void UNEXPECTED_IO_APIC(void) -{ -} - void __init print_IO_APIC(void) { int apic, i; @@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void) if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } /* @@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void) reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 498e8bc..d1e42e0 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -16,6 +16,7 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/thread_info.h> +#include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. 
*/ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) @@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * Reset the owner so that a process switch will not set * tss->io_bitmap_base to IO_BITMAP_OFFSET. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; put_cpu(); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 8db8d51..d2daf67 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -24,6 +24,9 @@ DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 4f5983c..0952ecc 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 84c3497..33cf2f3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -20,7 +20,6 @@ #include <linux/sysdev.h> #include <linux/sysctl.h> #include <linux/percpu.h> -#include <linux/dmi.h> #include <linux/kprobes.h> #include <linux/cpumask.h> #include <linux/kernel_stat.h> @@ -28,30 +27,14 @@ #include <asm/smp.h> #include <asm/nmi.h> #include <asm/kdebug.h> -#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" int unknown_nmi_panic; int nmi_watchdog_enabled; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. 
It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -extern void show_registers(struct pt_regs *regs); -extern int unknown_nmi_panic; - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_PERFCTR0); - case 15: - return (msr - MSR_P4_BPU_PERFCTR0); - } - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - - switch (boot_cpu_data.x86) { - case 6: - return (msr - MSR_P6_EVNTSEL0); - case 15: - return (msr - MSR_P4_BSU_ESCR0); - } - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]); 
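An illustrative aside, not part of the patch: the ownership tracking that nmi.c drops in this hunk is not deleted outright — reserve_perfctr_nmi()/release_perfctr_nmi() are still called by the new p4/arch-perfmon watchdog ops near the start of this section, so the mechanism has simply moved out of nmi.c. It amounts to one bit per counter MSR in a per-CPU bitmap; a condensed sketch of that pattern, reusing names from the removed code (claim_counter/release_counter are hypothetical helpers):

	static DEFINE_PER_CPU(unsigned long,
			      perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);

	static int claim_counter(int cpu, unsigned int bit)
	{
		/* 1 if this CPU now owns the counter, 0 if it was already taken */
		return !test_and_set_bit(bit, &per_cpu(perfctr_nmi_owner, cpu)[0]);
	}

	static void release_counter(int cpu, unsigned int bit)
	{
		clear_bit(bit, &per_cpu(perfctr_nmi_owner, cpu)[0]);
	}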
-} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_perfctr_nmi(cpu, msr); - } -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) - || (boot_cpu_data.x86 == 16)); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); - } - return 0; -} - static int endflag __initdata = 0; #ifdef CONFIG_SMP @@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. 
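A worked example (editorial illustration, not part of the patch): with only 31 usable magnitude bits, the largest programmable period is 0x7fffffff = 2,147,483,647 cycles. A 2 GHz CPU needs 2,000,000,000 cycles per tick at nmi_hz = 1, which fits, so the rate stays at 1 Hz; a 3 GHz CPU would need 3,000,000,000 cycles, which does not fit, so the code below raises the rate to 3,000,000,000 / 2,147,483,647 + 1 = 2 Hz and programs roughly 1.5e9 cycles instead.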
- * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void) if (!cpu_isset(cpu, cpu_callin_map)) continue; #endif - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], nmi_count(cpu)); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(prev_nmi_count); return 0; @@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write_around(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} +/* Suspend/resume support */ #ifdef CONFIG_PM @@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. 
- * Original code written by Keith Owens. - */ - -static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_P6_PERFCTR0; - evntsel_msr = MSR_P6_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p6_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick 
when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) +static void __acpi_nmi_enable(void *__unused) { - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; + apic_write_around(APIC_LVT0, APIC_DM_NMI); } -static void stop_p4_watchdog(void) +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); } -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) +static void __acpi_nmi_disable(void *__unused) { - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - 
struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } -static void stop_intel_arch_watchdog(void) +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) { - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. 
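A note on the check documented above (illustration only, not part of the patch): CPUID leaf 0x0A reports the architectural events in EBX with inverted polarity — a clear bit means the event is supported. Both the code removed here and its replacement near the top of this section therefore require mask_length to cover the event's index and the corresponding EBX bit to be zero. A self-contained version of that test, with a hypothetical helper name:

	static int arch_perfmon_has_unhalted_cycles(void)
	{
		union cpuid10_eax eax;
		unsigned int ebx, unused;

		cpuid(10, &eax.full, &ebx, &unused, &unused);
		/* usable only if the leaf describes the event and its bit is clear */
		return eax.split.mask_length > ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX &&
		       !(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT);
	}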
- */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } void setup_apic_nmi_watchdog (void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) - return; + if (__get_cpu_var(wd_enabled)) + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - return; - - if (!setup_p6_watchdog()) - return; - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - return; - - if (!setup_p4_watchdog()) - return; - break; - default: - return; - } - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - break; - stop_p6_watchdog(); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - break; - stop_p4_watchdog(); - break; - } - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; int rc=0; /* check for other users first */ @@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) alert_counter[cpu] = 0; } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog 
timer interrupt */ - goto done; - } - - /* only Intel P4 uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL); - } else { - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, } if (nmi_watchdog == NMI_DEFAULT) { - if (nmi_known_cpu() > 0) + if (lapic_watchdog_ok()) nmi_watchdog = NMI_LOCAL_APIC; else nmi_watchdog = NMI_IO_APIC; @@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 2ec331e..5c10f37 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -20,6 +20,7 @@ #include <linux/efi.h> #include <linux/bcd.h> #include <linux/start_kernel.h> +#include <linux/highmem.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -35,7 +36,7 @@ #include <asm/timer.h> /* nop stub */ -static void native_nop(void) +void _paravirt_nop(void) { } @@ -54,331 +55,148 @@ char *memory_setup(void) #define DEF_NATIVE(name, code) \ extern const char start_##name[], end_##name[]; \ asm("start_" #name ": " code "; end_" #name ":") -DEF_NATIVE(cli, "cli"); -DEF_NATIVE(sti, "sti"); -DEF_NATIVE(popf, "push %eax; popf"); -DEF_NATIVE(pushf, "pushf; pop %eax"); -DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); + +DEF_NATIVE(irq_disable, "cli"); +DEF_NATIVE(irq_enable, "sti"); +DEF_NATIVE(restore_fl, "push %eax; popf"); +DEF_NATIVE(save_fl, "pushf; pop %eax"); DEF_NATIVE(iret, "iret"); -DEF_NATIVE(sti_sysexit, "sti; sysexit"); +DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(clts, "clts"); +DEF_NATIVE(read_tsc, "rdtsc"); -static const struct native_insns -{ - 
const char *start, *end; -} native_insns[] = { - [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, - [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, - [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, - [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, - [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, - [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, - [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, -}; +DEF_NATIVE(ud2a, "ud2a"); static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) { - unsigned int insn_len; - - /* Don't touch it if we don't have a replacement */ - if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) - return len; - - insn_len = native_insns[type].end - native_insns[type].start; - - /* Similarly if we can't fit replacement. */ - if (len < insn_len) - return len; + const unsigned char *start, *end; + unsigned ret; + + switch(type) { +#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site + SITE(irq_disable); + SITE(irq_enable); + SITE(restore_fl); + SITE(save_fl); + SITE(iret); + SITE(irq_enable_sysexit); + SITE(read_cr2); + SITE(read_cr3); + SITE(write_cr3); + SITE(clts); + SITE(read_tsc); +#undef SITE + + patch_site: + ret = paravirt_patch_insns(insns, len, start, end); + break; - memcpy(insns, native_insns[type].start, insn_len); - return insn_len; -} + case PARAVIRT_PATCH(make_pgd): + case PARAVIRT_PATCH(make_pte): + case PARAVIRT_PATCH(pgd_val): + case PARAVIRT_PATCH(pte_val): +#ifdef CONFIG_X86_PAE + case PARAVIRT_PATCH(make_pmd): + case PARAVIRT_PATCH(pmd_val): +#endif + /* These functions end up returning exactly what + they're passed, in the same registers. */ + ret = paravirt_patch_nop(); + break; -static unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! 
*/ - - switch (regno) { - case 0: - asm("movl %%db0, %0" :"=r" (val)); break; - case 1: - asm("movl %%db1, %0" :"=r" (val)); break; - case 2: - asm("movl %%db2, %0" :"=r" (val)); break; - case 3: - asm("movl %%db3, %0" :"=r" (val)); break; - case 6: - asm("movl %%db6, %0" :"=r" (val)); break; - case 7: - asm("movl %%db7, %0" :"=r" (val)); break; default: - BUG(); - } - return val; -} - -static void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("movl %0,%%db0" : /* no output */ :"r" (value)); - break; - case 1: - asm("movl %0,%%db1" : /* no output */ :"r" (value)); - break; - case 2: - asm("movl %0,%%db2" : /* no output */ :"r" (value)); + ret = paravirt_patch_default(type, clobbers, insns, len); break; - case 3: - asm("movl %0,%%db3" : /* no output */ :"r" (value)); - break; - case 6: - asm("movl %0,%%db6" : /* no output */ :"r" (value)); - break; - case 7: - asm("movl %0,%%db7" : /* no output */ :"r" (value)); - break; - default: - BUG(); } -} - -void init_IRQ(void) -{ - paravirt_ops.init_IRQ(); -} - -static void native_clts(void) -{ - asm volatile ("clts"); -} - -static unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr0(unsigned long val) -{ - asm volatile("movl %0,%%cr0": :"r" (val)); -} - -static unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr2(unsigned long val) -{ - asm volatile("movl %0,%%cr2": :"r" (val)); -} - -static unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); - return val; -} - -static void native_write_cr3(unsigned long val) -{ - asm volatile("movl %0,%%cr3": :"r" (val)); -} - -static unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); - return val; -} - -static unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist */ - asm("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (val): "0" (0)); - return val; -} - -static void native_write_cr4(unsigned long val) -{ - asm volatile("movl %0,%%cr4": :"r" (val)); -} - -static unsigned long native_save_fl(void) -{ - unsigned long f; - asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); - return f; -} - -static void native_restore_fl(unsigned long f) -{ - asm volatile("pushl %0 ; popfl": /* no output */ - :"g" (f) - :"memory", "cc"); -} - -static void native_irq_disable(void) -{ - asm volatile("cli": : :"memory"); -} - -static void native_irq_enable(void) -{ - asm volatile("sti": : :"memory"); -} - -static void native_safe_halt(void) -{ - asm volatile("sti; hlt": : :"memory"); -} -static void native_halt(void) -{ - asm volatile("hlt": : :"memory"); + return ret; } -static void native_wbinvd(void) +unsigned paravirt_patch_nop(void) { - asm volatile("wbinvd": : :"memory"); + return 0; } -static unsigned long long native_read_msr(unsigned int msr, int *err) +unsigned paravirt_patch_ignore(unsigned len) { - unsigned long long val; - - asm volatile("2: rdmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %3,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=r" (*err), "=A" (val) - : "c" (msr), "i" (-EFAULT)); - - return val; + return len; } -static int 
native_write_msr(unsigned int msr, unsigned long long val) +unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, + void *site, u16 site_clobbers, + unsigned len) { - int err; - asm volatile("2: wrmsr ; xorl %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: movl %4,%0 ; jmp 1b\n\t" - ".previous\n\t" - ".section __ex_table,\"a\"\n" - " .align 4\n\t" - " .long 2b,3b\n\t" - ".previous" - : "=a" (err) - : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), - "i" (-EFAULT)); - return err; -} + unsigned char *call = site; + unsigned long delta = (unsigned long)target - (unsigned long)(call+5); -static unsigned long long native_read_tsc(void) -{ - unsigned long long val; - asm volatile("rdtsc" : "=A" (val)); - return val; -} + if (tgt_clobbers & ~site_clobbers) + return len; /* target would clobber too much for this site */ + if (len < 5) + return len; /* call too long for patch site */ -static unsigned long long native_read_pmc(void) -{ - unsigned long long val; - asm volatile("rdpmc" : "=A" (val)); - return val; -} + *call++ = 0xe8; /* call */ + *(unsigned long *)call = delta; -static void native_load_tr_desc(void) -{ - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); + return 5; } -static void native_load_gdt(const struct Xgt_desc_struct *dtr) +unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { - asm volatile("lgdt %0"::"m" (*dtr)); -} + unsigned char *jmp = site; + unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); -static void native_load_idt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} + if (len < 5) + return len; /* call too long for patch site */ -static void native_store_gdt(struct Xgt_desc_struct *dtr) -{ - asm ("sgdt %0":"=m" (*dtr)); -} + *jmp++ = 0xe9; /* jmp */ + *(unsigned long *)jmp = delta; -static void native_store_idt(struct Xgt_desc_struct *dtr) -{ - asm ("sidt %0":"=m" (*dtr)); + return 5; } -static unsigned long native_store_tr(void) +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len) { - unsigned long tr; - asm ("str %0":"=r" (tr)); - return tr; -} + void *opfunc = *((void **)¶virt_ops + type); + unsigned ret; -static void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C -} + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a); + else if (opfunc == paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); + else if (type == PARAVIRT_PATCH(iret) || + type == PARAVIRT_PATCH(irq_enable_sysexit)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(opfunc, site, len); + else + /* Otherwise call the function; assume target could + clobber any caller-save reg */ + ret = paravirt_patch_call(opfunc, CLBR_ANY, + site, clobbers, len); -static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) -{ - u32 *lp = (u32 *)((char *)dt + entry*8); - lp[0] = entry_low; - lp[1] = entry_high; + return ret; } -static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +unsigned paravirt_patch_insns(void *site, unsigned len, + const char *start, const char *end) { - native_write_dt_entry(dt, entrynum, low, high); -} + unsigned insn_len = end - start; -static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, 
entrynum, low, high); -} - -static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) -{ - native_write_dt_entry(dt, entrynum, low, high); -} + if (insn_len > len || start == NULL) + insn_len = len; + else + memcpy(site, start, insn_len); -static void native_load_esp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } + return insn_len; } -static void native_io_delay(void) +void init_IRQ(void) { - asm volatile("outb %al,$0x80"); + paravirt_ops.init_IRQ(); } static void native_flush_tlb(void) @@ -395,83 +213,11 @@ static void native_flush_tlb_global(void) __native_flush_tlb_global(); } -static void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(unsigned long addr) { __native_flush_tlb_single(addr); } -#ifndef CONFIG_X86_PAE -static void native_set_pte(pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) -{ - *ptep = pteval; -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - *pmdp = pmdval; -} - -#else /* CONFIG_X86_PAE */ - -static void native_set_pte(pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - set_64bit((unsigned long long *)ptep,pte_val(pteval)); -} - -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); -} - -static void native_set_pud(pud_t *pudp, pud_t pudval) -{ - *pudp = pudval; -} - -static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = 0; -} - -static void native_pmd_clear(pmd_t *pmd) -{ - u32 *tmp = (u32 *)pmd; - *tmp = 0; - smp_wmb(); - *(tmp + 1) = 0; -} -#endif /* CONFIG_X86_PAE */ - /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ .patch = native_patch, .banner = default_banner, - .arch_setup = native_nop, + .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, @@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = { .safe_halt = native_safe_halt, .halt = native_halt, .wbinvd = native_wbinvd, - .read_msr = native_read_msr, - .write_msr = native_write_msr, + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .get_scheduled_cycles = native_read_tsc, @@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - 
.write_ldt_entry = native_write_ldt_entry, - .write_gdt_entry = native_write_gdt_entry, - .write_idt_entry = native_write_idt_entry, + .write_ldt_entry = write_dt_entry, + .write_gdt_entry = write_dt_entry, + .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, .set_iopl_mask = native_set_iopl_mask, @@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, + .startup_ipi_hook = paravirt_nop, #endif - .set_lazy_mode = (void *)native_nop, + .set_lazy_mode = paravirt_nop, + + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_others = native_flush_tlb_others, - .map_pt_hook = (void *)native_nop, - - .alloc_pt = (void *)native_nop, - .alloc_pd = (void *)native_nop, - .alloc_pd_clone = (void *)native_nop, - .release_pt = (void *)native_nop, - .release_pd = (void *)native_nop, + .alloc_pt = paravirt_nop, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pt = paravirt_nop, + .release_pd = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, - .pte_update = (void *)native_nop, - .pte_update_defer = (void *)native_nop, + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = kmap_atomic, +#endif + #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, + + .pmd_val = native_pmd_val, + .make_pmd = native_make_pmd, #endif + .pte_val = native_pte_val, + .pgd_val = native_pgd_val, + + .make_pte = native_make_pte, + .make_pgd = native_make_pgd, + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, - .startup_ipi_hook = (void *)native_nop, + .dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, + .activate_mm = paravirt_nop, }; -/* - * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops - * semantics are subject to change. Hence we only do this - * internal-only export of this, until it gets sorted out and - * all lowlevel CPU ops used by modules are separately exported. - */ -EXPORT_SYMBOL_GPL(paravirt_ops); +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 393a67d..6199947 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> +#include <linux/percpu.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -57,7 +58,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/pda.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -66,6 +66,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * Return saved PC of a blocked thread. 
*/ @@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c) } } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; #ifdef CONFIG_X86_SMP if (smp_num_siblings > 1) printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); #endif - } else if (!strncmp(str, "halt", 4)) { - printk("using halt in idle threads.\n"); - pm_idle = default_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { @@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xfs = __KERNEL_PDA; + regs.xfs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -376,7 +381,7 @@ void exit_thread(void) t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; tss->io_bitmap_max = 0; - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } } @@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * Disable the bitmap via an invalid offset. We still cache * the previous bitmap owner and the IO bitmap contents: */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; return; } @@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * matches the next task, we dont have to do anything but * to set a valid offset in the TSS: */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; return; } /* @@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, * redundant copies when the currently switched task does not * perform any I/O during its timeslice. */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } /* @@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (prev->gs | next->gs) loadsegment(gs, next->gs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); return prev_p; } diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 34874c3..9f6ab17 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,12 +3,10 @@ */ #include <linux/pci.h> #include <linux/irq.h> -#include <asm/pci-direct.h> -#include <asm/genapic.h> -#include <asm/cpu.h> #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) * based platforms. - * For those platforms, make sure that the genapic is set to 'flat' + * Disable SW irqbalance/affinity on those platforms. 
*/ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) return; + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + /* enable access to config space*/ pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); @@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif - } - - /* put back the original value for config space*/ - if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); -} - -void __init quirk_intel_irqbalance(void) -{ - u8 config, rev; - u32 word; - - /* BIOS may enable hardware IRQ balancing for - * E7520/E7320/E7525(revision ID 0x9 and below) - * based platforms. - * Disable SW irqbalance/affinity on those platforms. - */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); - if (rev > 0x9) - return; - - printk(KERN_INFO "Intel E7520/7320/7525 detected."); - - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); - - /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); - - if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); #ifdef CONFIG_IRQBALANCE irqbalance_disable(""); @@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void) #ifdef CONFIG_PROC_FS no_irq_affinity = 1; #endif -#ifdef CONFIG_HOTPLUG_CPU - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; -#endif -#ifdef CONFIG_X86_64 - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. - */ - genapic_force = &apic_flat; -#endif } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 3514b41..50dfc65 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -17,7 +17,8 @@ #include <asm/apic.h> #include <asm/desc.h> #include "mach_reboot.h" -#include <linux/reboot_fixups.h> +#include <asm/reboot_fixups.h> +#include <asm/reboot.h> /* * Power off function, if any @@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] = */ void machine_real_restart(unsigned char *code, int length) { - unsigned long flags; - local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length) safe side. 
(Yes, CMOS_WRITE does outb_p's. - Paul G.) */ - spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); - spin_unlock_irqrestore(&rtc_lock, flags); + spin_unlock(&rtc_lock); /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at @@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length) EXPORT_SYMBOL(machine_real_restart); #endif -void machine_shutdown(void) +static void native_machine_shutdown(void) { #ifdef CONFIG_SMP int reboot_cpu_id; @@ -316,7 +315,11 @@ void machine_shutdown(void) #endif } -void machine_emergency_restart(void) +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + +static void native_machine_emergency_restart(void) { if (!reboot_thru_bios) { if (efi_enabled) { @@ -340,17 +343,17 @@ void machine_emergency_restart(void) machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); } -void machine_restart(char * __unused) +static void native_machine_restart(char * __unused) { machine_shutdown(); machine_emergency_restart(); } -void machine_halt(void) +static void native_machine_halt(void) { } -void machine_power_off(void) +static void native_machine_power_off(void) { if (pm_power_off) { machine_shutdown(); @@ -359,3 +362,35 @@ void machine_power_off(void) } +struct machine_ops machine_ops = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c index 99aab41..2d78d91 100644 --- a/arch/i386/kernel/reboot_fixups.c +++ b/arch/i386/kernel/reboot_fixups.c @@ -10,7 +10,7 @@ #include <asm/delay.h> #include <linux/pci.h> -#include <linux/reboot_fixups.h> +#include <asm/reboot_fixups.h> static void cs5530a_warm_reset(struct pci_dev *dev) { diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 0e89778..89a45a9 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector) } /* - * This is only used on smaller machines. + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +static inline void __send_IPI_dest_field(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long cfg; - unsigned long flags; - local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. */ - apic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + apic_wait_icr_idle(); /* * prepare target chip field @@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); +} + +/* + * This is only used on smaller machines. 
+ */ +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long flags; + local_irq_save(flags); + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; + unsigned long flags; unsigned int query_cpu; /* @@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { if (cpu_isset(query_cpu, mask)) { - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu)); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), + vector); } } local_irq_restore(flags); @@ -256,7 +248,6 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* * We cannot call mmdrop() because we are in interrupt context, @@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); @@ -353,9 +344,11 @@ out: put_cpu_no_resched(); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { + cpumask_t cpumask = *cpumaskp; + /* * A couple of (to be removed) sanity checks: * @@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); +#ifdef CONFIG_HOTPLUG_CPU /* If a CPU which we ran on has gone down, OK. */ cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) + if (unlikely(cpus_empty(cpumask))) return; +#endif /* * i'm not happy about this global shared spinlock in the @@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, flush_mm = mm; flush_va = va; -#if NR_CPUS <= BITS_PER_LONG - atomic_set_mask(cpumask, &flush_cpumask); -#else - { - int k; - unsigned long *flush_mask = (unsigned long *)&flush_cpumask; - unsigned long *cpu_mask = (unsigned long *)&cpumask; - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) - atomic_set_mask(cpu_mask[k], &flush_mask[k]); - } -#endif + cpus_or(flush_cpumask, cpumask, flush_cpumask); /* * We have to send the IPI only to * CPUs affected. @@ -417,7 +402,7 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -483,7 +468,7 @@ void flush_tlb_all(void) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... 
*/ -void smp_send_reschedule(int cpu) +void native_smp_send_reschedule(int cpu) { WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); @@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void) static struct call_data_struct *call_data; +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus() - 1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + + /** - * smp_call_function(): Run a function on all other CPUs. + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +int native_smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; + cpumask_t allbutself; int cpus; + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); - cpus = num_online_cpus() - 1; + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + if (!cpus) { spin_unlock(&call_lock); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, call_data = &data; mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); /* Wait for response */ while (atomic_read(&data.started) != cpus) @@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, return 0; } + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. 
+ * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + return smp_call_function_mask(cpu_online_map, func, info, wait); +} EXPORT_SYMBOL(smp_call_function); +/** + * smp_call_function_single - Run a function on another CPU + * @cpu: The target CPU. Cannot be the calling CPU. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int ret; + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + + put_cpu(); + return ret; +} +EXPORT_SYMBOL(smp_call_function_single); + static void stop_this_cpu (void * dummy) { + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy) * this function calls the 'stop' function on all other CPUs in the system. */ -void smp_send_stop(void) +void native_smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + /* Don't deadlock on the call lock in panic */ + int nolock = !spin_trylock(&call_lock); + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* @@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) } } -/* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. - */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = 1; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function_single - Run a function on another CPU - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Currently unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Retrurns 0 on success, else a negative status code. 
- * - * Does not return until the remote CPU is nearly ready to execute <func> - * or is or has executed. - */ - -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int me = get_cpu(); - if (cpu == me) { - WARN_ON(1); - put_cpu(); - return -EBUSY; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); - put_cpu(); - return 0; -} -EXPORT_SYMBOL(smp_call_function_single); - static int convert_apicid_to_cpu(int apic_id) { int i; @@ -730,3 +747,14 @@ int safe_smp_processor_id(void) return cpuid >= 0 ? cpuid : 0; } + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .cpu_up = native_cpu_up, + .smp_cpus_done = native_smp_cpus_done, + + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, + .smp_call_function_mask = native_smp_call_function_mask, +}; diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4ff55e6..a4b7ad2 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -53,13 +53,12 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/nmi.h> -#include <asm/pda.h> -#include <asm/genapic.h> #include <mach_apic.h> #include <mach_wakecpu.h> #include <smpboot_hooks.h> #include <asm/vmi.h> +#include <asm/mtrr.h> /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + /* * Trampoline 80x86 program as an array. */ @@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id) *c = boot_cpu_data; if (id!=0) - identify_cpu(c); + identify_secondary_cpu(c); /* * Mask B, Pentium, but not Pentium MMX */ @@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu) static void __cpuinit start_secondary(void *unused) { /* - * Don't put *anything* before secondary_cpu_init(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. + * Don't put *anything* before cpu_init(), SMP booting is too + * fragile that we want to limit the things done here to the + * most necessary things. */ #ifdef CONFIG_VMI vmi_bringup(); #endif - secondary_cpu_init(); + cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) @@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused) void __devinit initialize_secondary(void) { /* - * switch to the per CPU GDT we already set up - * in do_boot_cpu() - */ - cpu_set_gdt(current_thread_info()->cpu); - - /* * We don't actually need to load the full TSS, * basically just the stack pointer and the eip. 
*/ @@ -463,7 +459,6 @@ extern struct { void * esp; unsigned short ss; } stack_start; -extern struct i386_pda *start_pda; #ifdef CONFIG_NUMA @@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu) unmap_cpu_to_node(cpu); } -#if APIC_DEBUG static inline void __inquire_remote_apic(int apicid) { int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned long status; printk("Inquiring remote APIC #%d...\n", apicid); @@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk("%lx\n", status); break; default: printk("failed\n"); } } } -#endif #ifdef WAKE_SECONDARY_VIA_NMI /* @@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid) static int __devinit wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int timeout, maxlvt; + unsigned long send_status, accept_status = 0; + int maxlvt; /* Target chip */ apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); @@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. @@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) static int __devinit wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; /* * Be paranoid about clearing APIC errors. @@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); atomic_set(&init_deasserted, 1); @@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. 
@@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu) #define alloc_idle_task(cpu) fork_idle(cpu) #endif +/* Initialize the CPU's GDT. This is either the boot CPU doing itself + (still using the master per-cpu area), or a CPU doing it for a + secondary which will soon come up. */ +static __cpuinit void init_gdt(int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, + (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + __per_cpu_offset[cpu], 0xFFFFF, + 0x80 | DESCTYPE_S | 0x2, 0x8); + + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + per_cpu(cpu_number, cpu) = cpu; +} + +/* Defined in head.S */ +extern struct Xgt_desc_struct early_gdt_descr; + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) unsigned short nmi_high = 0, nmi_low = 0; /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ @@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - /* Pre-allocate and initialize the CPU's GDT and PDA so it - doesn't have to do any memory allocation during the - delicate CPU-bringup phase. */ - if (!init_gdt(cpu, idle)) { - printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); - return -1; /* ? */ - } + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ @@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; int apicid, ret; - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) goto exit; } - /* - * the CPU isn't initialized at boot time, allocate gdt table here. - * cpu_init will initialize it - */ - if (!cpu_gdt_descr->address) { - cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); - if (!cpu_gdt_descr->address) - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - ret = -ENOMEM; - goto exit; - } - info.complete = &done; info.apicid = apicid; info.cpu = cpu; @@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ -void __init smp_prepare_cpus(unsigned int max_cpus) +void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_commenced_mask = cpumask_of_cpu(0); cpu_callin_map = cpumask_of_cpu(0); @@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus) smp_boot_cpus(max_cpus); } -void __devinit smp_prepare_boot_cpu(void) +void __init native_smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); - cpu_set(smp_processor_id(), cpu_present_map); - cpu_set(smp_processor_id(), cpu_possible_map); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unsigned int cpu = smp_processor_id(); + + init_gdt(cpu); + switch_to_new_gdt(); + + cpu_set(cpu, cpu_online_map); + cpu_set(cpu, cpu_callout_map); + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); + __get_cpu_var(cpu_state) = CPU_ONLINE; } #ifdef CONFIG_HOTPLUG_CPU @@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu) { unsigned long flags; #ifdef CONFIG_HOTPLUG_CPU @@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu) touch_nmi_watchdog(); } -#ifdef CONFIG_X86_GENERICARCH - if (num_online_cpus() > 8 && genapic == &apic_default) - panic("Default flat APIC routing can't be used with > 8 cpus\n"); -#endif - return 0; } -void __init smp_cpus_done(unsigned int max_cpus) +void __init native_smp_cpus_done(unsigned int max_cpus) { #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 13ca54a..ff4ee6f 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -22,16 +22,26 @@ #include <asm/msr.h> #include <asm/pgtable.h> #include <asm/unistd.h> +#include <asm/elf.h> +#include <asm/tlbflush.h> + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -#ifdef CONFIG_PARAVIRT -unsigned int __read_mostly vdso_enabled = 0; -#else -unsigned int __read_mostly vdso_enabled = 1; -#endif +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; EXPORT_SYMBOL_GPL(vdso_enabled); @@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup); extern asmlinkage void sysenter_entry(void); +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_HIGH_BASE; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... 
DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_HIGH_BASE; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + !elf_check_arch(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_HIGH_BASE; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_HIGH_BASE; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_HIGH_BASE; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -56,14 +183,33 @@ void enable_sep_cpu(void) return; } - tss->ss1 = __KERNEL_CS; - tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); put_cpu(); } +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + /* * These symbols are defined by vsyscall.o to mark the bounds * of the ELF DSO images included therein. @@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; static struct page *syscall_pages[1]; +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + map ? 
PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + syscall_pages[0] = virt_to_page(syscall_page); -#ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); + gate_vma_init(); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); -#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(syscall_page, - &vsyscall_int80_start, - &vsyscall_int80_end - &vsyscall_int80_start); - return 0; + vsyscall = &vsyscall_int80_start; + vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + } else { + vsyscall = &vsyscall_sysenter_start; + vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; } - memcpy(syscall_page, - &vsyscall_sysenter_start, - &vsyscall_sysenter_end - &vsyscall_sysenter_start); + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); return 0; } -#ifndef CONFIG_COMPAT_VDSO /* Defined in vsyscall-sysenter.S */ extern void SYSENTER_RETURN; @@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = 0; + bool compat; down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall_pages); - if (ret) - goto up_fail; + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. 
+ */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + syscall_pages); + + if (ret) + goto up_fail; + } current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); -up_fail: + (void *)VDSO_SYM(&SYSENTER_RETURN); + + up_fail: up_write(&mm->mmap_sem); + return ret; } @@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma) struct vm_area_struct *get_gate_vma(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; return NULL; } @@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } -#endif diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 94e5cb0..a665df6 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -70,8 +70,6 @@ #include <asm/i8259.h> -int pit_latch_buggy; /* extern */ - #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S index 2f1814c..f62815f 100644 --- a/arch/i386/kernel/trampoline.S +++ b/arch/i386/kernel/trampoline.S @@ -29,7 +29,7 @@ * * TYPE VALUE * R_386_32 startup_32_smp - * R_386_32 boot_gdt_table + * R_386_32 boot_gdt */ #include <linux/linkage.h> @@ -62,8 +62,8 @@ r_base = . * to 32 bit. */ - lidtl boot_idt - r_base # load idt with 0, 0 - lgdtl boot_gdt - r_base # load gdt with whatever is appropriate + lidtl boot_idt_descr - r_base # load idt with 0, 0 + lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit @@ -73,11 +73,11 @@ r_base = . # These need to be in the same 64K segment as the above; # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt: +boot_gdt_descr: .word __BOOT_DS + 7 # gdt limit - .long boot_gdt_table-__PAGE_OFFSET # gdt base + .long boot_gdt - __PAGE_OFFSET # gdt base -boot_idt: +boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index af0d3f7..f21b41e 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, siginfo_t *info) { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (regs->eflags & VM_MASK) { if (vm86) @@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, goto kernel_trap; trap_signal: { + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. 
+ */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) force_sig_info(signr, info, tsk); else @@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } kernel_trap: { - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } @@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, * and we set the offset field correctly. Then we let the CPU to * restart the faulting instruction. */ - if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && thread->io_bitmap_ptr) { memcpy(tss->io_bitmap, thread->io_bitmap_ptr, thread->io_bitmap_max); @@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, thread->io_bitmap_max, 0xff, tss->io_bitmap_max - thread->io_bitmap_max); tss->io_bitmap_max = thread->io_bitmap_max; - tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; tss->io_bitmap_owner = thread; put_cpu(); return; } put_cpu(); - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -624,6 +634,8 @@ gp_in_vm86: gp_in_kernel: if (!fixup_exception(regs)) { + current->thread.error_code = error_code; + current->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; @@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - int cpu = smp_processor_id(); - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 6cb8f53..f64b81f 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { if (!freq->old){ ref_freq = freq->new; - goto end; + return 0; } ref_freq = freq->old; loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; @@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) * TSC based sched_clock turns * to junk w/ cpufreq */ - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } } } -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); return 0; } @@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; + printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S new file mode 100644 index 0000000..e51a869 
--- /dev/null +++ b/arch/i386/kernel/verify_cpu.S @@ -0,0 +1,65 @@ +/* Check if CPU has some minimum CPUID bits + This runs in 16bit mode so that the caller can still use the BIOS + to output errors on the screen */ +#include <asm/cpufeature.h> + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + +#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4 + pushfl + orl $(1<<18),(%esp) # try setting AC + popfl + pushfl + popl %eax + testl $(1<<18),%eax + jz bad +#endif +#if REQUIRED_MASK1 != 0 + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz bad # REQUIRED_MASK1 != 0 requires CPUID + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb bad # no cpuid 1 + + movl $0x1,%eax # Does the cpu have what it takes + cpuid + +#if CONFIG_X86_MINIMUM_CPU_MODEL > 4 +#error add proper model checking here +#endif + + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz bad +#endif /* REQUIRED_MASK1 */ + + popfl + xor %eax,%eax + ret + +bad: + popfl + movl $1,%eax + ret diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 697a70e..c8726c4 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -26,6 +26,7 @@ #include <linux/cpu.h> #include <linux/bootmem.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> @@ -56,7 +57,7 @@ static int disable_noidle; static int disable_vmi_timer; /* Cached VMI operations */ -struct { +static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); @@ -65,16 +66,15 @@ struct { void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, u32, u32, u32); - void (*flush_tlb)(int); + void (*set_linear_mapping)(int, void *, u32, u32); + void (*_flush_tlb)(int); void (*set_initial_ap_state)(int, int); void (*halt)(void); void (*set_lazy_mode)(int mode); } vmi_ops; -/* XXX move this to alternative.h */ -extern struct paravirt_patch __start_parainstructions[], - __stop_parainstructions[]; +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; /* * VMI patching routines. 
@@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[], #define MNEM_JMP 0xe9 #define MNEM_RET 0xc3 -static char irq_save_disable_callout[] = { - MNEM_CALL, 0, 0, 0, 0, - MNEM_CALL, 0, 0, 0, 0, - MNEM_RET -}; #define IRQ_PATCH_INT_MASK 0 #define IRQ_PATCH_DISABLE 5 @@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns) static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) { switch (type) { - case PARAVIRT_IRQ_DISABLE: + case PARAVIRT_PATCH(irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns); - case PARAVIRT_IRQ_ENABLE: + case PARAVIRT_PATCH(irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns); - case PARAVIRT_RESTORE_FLAGS: + case PARAVIRT_PATCH(restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS: + case PARAVIRT_PATCH(save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns); - case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: - if (len >= 10) { - patch_internal(VMI_CALL_GetInterruptMask, len, insns); - patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); - return 10; - } else { - /* - * You bastards didn't leave enough room to - * patch save_flags_irq_disable inline. Patch - * to a helper - */ - BUG_ON(len < 5); - *(char *)insns = MNEM_CALL; - patch_offset(insns, irq_save_disable_callout); - return 5; - } - case PARAVIRT_INTERRUPT_RETURN: + case PARAVIRT_PATCH(iret): return patch_internal(VMI_CALL_IRET, len, insns); - case PARAVIRT_STI_SYSEXIT: + case PARAVIRT_PATCH(irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns); default: break; @@ -230,24 +209,24 @@ static void vmi_set_tr(void) static void vmi_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->esp0 = thread->esp0; + tss->x86_tss.esp0 = thread->esp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); } static void vmi_flush_tlb_user(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB); + vmi_ops._flush_tlb(VMI_FLUSH_TLB); } static void vmi_flush_tlb_kernel(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); + vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); } /* Stub to do nothing at all; used for delays and unimplemented calls */ @@ -255,18 +234,6 @@ static void vmi_nop(void) { } -/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ -static fastcall void vmi_safe_halt(void) -{ - int idle = vmi_stop_hz_timer(); - vmi_ops.halt(); - if (idle) { - local_irq_disable(); - vmi_account_time_restart_hz_timer(); - local_irq_enable(); - } -} - #ifdef CONFIG_DEBUG_PAGE_TYPE #ifdef CONFIG_X86_PAE @@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type) #define vmi_check_page_type(p,t) do { } while (0) #endif -static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) +#ifdef CONFIG_HIGHPTE +static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { + void *va = kmap_atomic(page, type); + /* * Internally, the VMI ROM must map virtual addresses to physical * addresses for processing MMU updates. 
By the time MMU updates @@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) * args: SLOT VA COUNT PFN */ BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); + vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + return va; } +#endif static void vmi_allocate_pt(u32 pfn) { @@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn) ((level) | (is_current_as(mm, user) ? \ (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) -static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); @@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte) vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } -static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); @@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } -void vmi_pmd_clear(pmd_t *pmd) +static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); @@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd) #endif #ifdef CONFIG_SMP -extern void setup_pda(void); - static void __devinit vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, ap.ds = __USER_DS; ap.es = __USER_DS; - ap.fs = __KERNEL_PDA; + ap.fs = __KERNEL_PERCPU; ap.gs = 0; ap.eflags = 0; - setup_pda(); - #ifdef CONFIG_X86_PAE /* efer should match BSP efer. */ if (cpu_has_nx) { @@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(int mode) +static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) { - static DEFINE_PER_CPU(int, lazy_mode); + static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); if (!vmi_ops.set_lazy_mode) return; @@ -685,7 +654,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); } /* @@ -740,7 +709,6 @@ do { \ } \ } while (0) - /* * Activate the VMI interface and switch into paravirtualized mode */ @@ -796,12 +764,6 @@ static inline int __init activate_vmi(void) para_fill(irq_disable, DisableInterrupts); para_fill(irq_enable, EnableInterrupts); - /* irq_save_disable !!! 
sheer pain */ - patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], - (char *)paravirt_ops.save_fl); - patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], - (char *)paravirt_ops.irq_disable); - para_fill(wbinvd, WBINVD); para_fill(read_tsc, RDTSC); @@ -831,8 +793,8 @@ static inline int __init activate_vmi(void) para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); + para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); para_fill(flush_tlb_single, InvalPage); /* @@ -878,8 +840,13 @@ static inline int __init activate_vmi(void) paravirt_ops.release_pt = vmi_release_pt; paravirt_ops.release_pd = vmi_release_pd; } - para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, - SetLinearMapping); + + /* Set linear is needed in all cases */ + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); +#ifdef CONFIG_HIGHPTE + if (vmi_ops.set_linear_mapping) + paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; +#endif /* * These MUST always be patched. Don't support indirect jumps @@ -920,8 +887,8 @@ static inline int __init activate_vmi(void) paravirt_ops.get_wallclock = vmi_get_wallclock; paravirt_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; - paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; + paravirt_ops.setup_boot_clock = vmi_time_bsp_init; + paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; paravirt_ops.get_cpu_khz = vmi_cpu_khz; @@ -933,11 +900,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - /* No idle HZ mode only works if VMI timer and no idle is enabled */ - if (disable_noidle || disable_vmi_timer) - para_fill(safe_halt, Halt); - else - para_wrap(safe_halt, vmi_safe_halt, halt, Halt); + para_fill(safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough @@ -945,7 +908,7 @@ static inline int __init activate_vmi(void) * to do this before IRQs get reenabled. Fortunately, it is * idempotent. */ - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); vmi_bringup(); diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c new file mode 100644 index 0000000..26a37f8 --- /dev/null +++ b/arch/i386/kernel/vmiclock.c @@ -0,0 +1,318 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2007, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/cpumask.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> + +#include <asm/vmi.h> +#include <asm/vmi_time.h> +#include <asm/arch_hooks.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/timer.h> + +#include <irq_vectors.h> +#include "io_ports.h" + +#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) +#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) + +static DEFINE_PER_CPU(struct clock_event_device, local_events); + +static inline u32 vmi_counter(u32 flags) +{ + /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding + * cycle counter. */ + return flags & VMI_ALARM_COUNTER_MASK; +} + +/* paravirt_ops.get_wallclock = vmi_get_wallclock */ +unsigned long vmi_get_wallclock(void) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec + (void)do_div(wallclock, 1000000000); // sec + + return wallclock; +} + +/* paravirt_ops.set_wallclock = vmi_set_wallclock */ +int vmi_set_wallclock(unsigned long now) +{ + return 0; +} + +/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ +unsigned long long vmi_get_sched_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ +unsigned long vmi_cpu_khz(void) +{ + unsigned long long khz; + khz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(khz, 1000); + return khz; +} + +static inline unsigned int vmi_get_timer_vector(void) +{ +#ifdef CONFIG_X86_IO_APIC + return FIRST_DEVICE_VECTOR; +#else + return FIRST_EXTERNAL_VECTOR; +#endif +} + +/** vmi clockchip */ +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int startup_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, vmi_get_timer_vector()); + + return (val & APIC_SEND_PENDING); +} + +static void mask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val | APIC_LVT_MASKED); +} + +static void unmask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); +} + +static void ack_timer_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip vmi_chip __read_mostly = { + .name = "VMI-LOCAL", + .startup = startup_timer_irq, + .mask = mask_timer_irq, + .unmask = unmask_timer_irq, + .ack = ack_timer_irq +}; +#endif + +/** vmi clockevent */ +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 +static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; + +static inline int vmi_get_alarm_wiring(void) +{ + return vmi_wiring; +} + +static void vmi_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + cycle_t now, cycles_per_hz; + BUG_ON(!irqs_disabled()); + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + break; + case CLOCK_EVT_MODE_PERIODIC: + cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_hz, HZ); + now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); + vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + switch (evt->mode) { + case CLOCK_EVT_MODE_ONESHOT: + 
vmi_timer_ops.cancel_alarm(VMI_ONESHOT); + break; + case CLOCK_EVT_MODE_PERIODIC: + vmi_timer_ops.cancel_alarm(VMI_PERIODIC); + break; + default: + break; + } + break; + default: + break; + } +} + +static int vmi_timer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + /* Unfortunately, set_next_event interface only passes relative + * expiry, but we want absolute expiry. It'd be better if were + * were passed an aboslute expiry, since a bunch of time may + * have been stolen between the time the delta is computed and + * when we set the alarm below. */ + cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); + + BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); + return 0; +} + +static struct clock_event_device vmi_clockevent = { + .name = "vmi-timer", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 22, + .set_mode = vmi_timer_set_mode, + .set_next_event = vmi_timer_next_event, + .rating = 1000, + .irq = 0, +}; + +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(local_events); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction vmi_clock_action = { + .name = "vmi-timer", + .handler = vmi_timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_ALL, +}; + +static void __devinit vmi_time_init_clockevent(void) +{ + cycle_t cycles_per_msec; + struct clock_event_device *evt; + + int cpu = smp_processor_id(); + evt = &__get_cpu_var(local_events); + + /* Use cycles_per_msec since div_sc params are 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + memcpy(evt, &vmi_clockevent, sizeof(*evt)); + /* Must pick .shift such that .mult fits in 32-bits. Choosing + * .shift to be 22 allows 2^(32-22) cycles per nano-seconds + * before overflow. */ + evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); + /* Upper bound is clockevent's use of ulong for cycle deltas. */ + evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); + evt->min_delta_ns = clockevent_delta2ns(1, evt); + evt->cpumask = cpumask_of_cpu(cpu); + + printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + evt->name, evt->mult, evt->shift); + clockevents_register_device(evt); +} + +void __init vmi_time_init(void) +{ + /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + vmi_time_init_clockevent(); + setup_irq(0, &vmi_clock_action); +} + +#ifdef CONFIG_X86_LOCAL_APIC +void __devinit vmi_time_bsp_init(void) +{ + /* + * On APIC systems, we want local timers to fire on each cpu. We do + * this by programming LVTT to deliver timer events to the IRQ handler + * for IRQ-0, since we can't re-use the APIC local timer handler + * without interfering with that code. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + local_irq_disable(); +#ifdef CONFIG_X86_SMP + /* + * XXX handle_percpu_irq only defined for SMP; we need to switch over + * to using it, since this is a local interrupt, which each CPU must + * handle individually without locking out or dropping simultaneous + * local timers on other CPUs. We also don't want to trigger the + * quirk workaround code for interrupts which gets invoked from + * handle_percpu_irq via eoi, so we use our own IRQ chip. 
+ */ + set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); +#else + set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); +#endif + vmi_wiring = VMI_ALARM_WIRED_LVTT; + apic_write(APIC_LVTT, vmi_get_timer_vector()); + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); +} + +void __devinit vmi_time_ap_init(void) +{ + vmi_time_init_clockevent(); + apic_write(APIC_LVTT, vmi_get_timer_vector()); +} +#endif + +/** vmi clocksource */ + +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init init_vmi_clocksource(void) +{ + cycle_t cycles_per_msec; + + if (!vmi_timer_ops.get_cycle_frequency) + return 0; + /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + /* Note that clocksource.{mult, shift} converts in the opposite direction + * as clockevents. */ + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + + printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); + return clocksource_register(&clocksource_vmi); + +} +module_init(init_vmi_clocksource); diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c deleted file mode 100644 index 9dfb177..0000000 --- a/arch/i386/kernel/vmitime.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to dhecht@vmware.com - * - */ - -/* - * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. - * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> -#include <linux/rcupdate.h> -#include <linux/clocksource.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/div64.h> -#include <asm/timer.h> -#include <asm/desc.h> - -#include <asm/vmi.h> -#include <asm/vmi_time.h> - -#include <mach_timer.h> -#include <io_ports.h> - -#ifdef CONFIG_X86_LOCAL_APIC -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT -#else -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 -#endif - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -#ifdef CONFIG_NO_IDLE_HZ - -/* /proc/sys/kernel/hz_timer state. 
*/ -int sysctl_hz_timer; - -/* Some stats */ -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); -static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ -static int alarm_hz = CONFIG_VMI_ALARM_HZ; - -/* Cache of the value get_cycle_frequency / HZ. */ -static signed long long cycles_per_jiffy; - -/* Cache of the value get_cycle_frequency / alarm_hz. */ -static signed long long cycles_per_alarm; - -/* The number of cycles accounted for by the 'jiffies'/'xtime' count. - * Protected by xtime_lock. */ -static unsigned long long real_cycles_accounted_system; - -/* The number of cycles accounted for by update_process_times(), per cpu. */ -static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); - -/* The number of stolen cycles accounted, per cpu. */ -static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); - -/* Clock source. */ -static cycle_t read_real_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); -} - -static cycle_t read_available_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); -} - -#if 0 -static cycle_t read_stolen_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); -} -#endif /* 0 */ - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - - -/* Timer interrupt handler. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); - -static struct irqaction vmi_timer_irq = { - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED, - .mask = CPU_MASK_NONE, - .name = "VMI-alarm", -}; - -/* Alarm rate */ -static int __init vmi_timer_alarm_rate_setup(char* str) -{ - int alarm_rate; - if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { - alarm_hz = alarm_rate; - printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); - } - return 1; -} -__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); - - -/* Initialization */ -static void vmi_get_wallclock_ts(struct timespec *ts) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec units - ts->tv_nsec = do_div(wallclock, 1000000000); - ts->tv_sec = wallclock; -} - -unsigned long vmi_get_wallclock(void) -{ - struct timespec ts; - vmi_get_wallclock_ts(&ts); - return ts.tv_sec; -} - -int vmi_set_wallclock(unsigned long now) -{ - return -1; -} - -unsigned long long vmi_get_sched_cycles(void) -{ - return read_available_cycles(); -} - -unsigned long vmi_cpu_khz(void) -{ - unsigned long long khz; - - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -void __init vmi_time_init(void) -{ - unsigned long long cycles_per_sec, cycles_per_msec; - unsigned long flags; - - local_irq_save(flags); - setup_irq(0, &vmi_timer_irq); -#ifdef CONFIG_X86_LOCAL_APIC - set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); -#endif - - real_cycles_accounted_system = read_real_cycles(); - per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); - - cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); - cycles_per_jiffy = cycles_per_sec; - (void)do_div(cycles_per_jiffy, HZ); - cycles_per_alarm = cycles_per_sec; - (void)do_div(cycles_per_alarm, alarm_hz); - cycles_per_msec = 
cycles_per_sec; - (void)do_div(cycles_per_msec, 1000); - - printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" - "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, - cycles_per_alarm); - - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - if (clocksource_register(&clocksource_vmi)) - printk(KERN_WARNING "Error registering VMITIME clocksource."); - - /* Disable PIT. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu - * reduce the latency calling update_process_times. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - - local_irq_restore(flags); -} - -#ifdef CONFIG_X86_LOCAL_APIC - -void __init vmi_timer_setup_boot_alarm(void) -{ - local_irq_disable(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - local_irq_enable(); -} - -/* Initialize the time accounting variables for an AP on an SMP system. - * Also, set the local alarm for the AP. */ -void __devinit vmi_timer_setup_secondary_alarm(void) -{ - int cpu = smp_processor_id(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); - - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); -} - -#endif - -/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ -static void vmi_account_real_cycles(unsigned long long cur_real_cycles) -{ - long long cycles_not_accounted; - - write_seqlock(&xtime_lock); - - cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; - while (cycles_not_accounted >= cycles_per_jiffy) { - /* systems wide jiffies. */ - do_timer(1); - - cycles_not_accounted -= cycles_per_jiffy; - real_cycles_accounted_system += cycles_per_jiffy; - } - - write_sequnlock(&xtime_lock); -} - -/* Update per-cpu process times. */ -static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - /* Account time to the current process. This includes - * calling into the scheduler to decrement the timeslice - * and possibly reschedule.*/ - update_process_times(user_mode(regs)); - /* XXX handle /proc/profile multiplier. */ - profile_tick(CPU_PROFILING); - - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } -} - -#ifdef CONFIG_NO_IDLE_HZ -/* Update per-cpu idle times. Used when a no-hz halt is ended. 
*/ -static void vmi_account_no_hz_idle_cycles(int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - unsigned long no_idle_hz_jiffies = 0; - - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - no_idle_hz_jiffies++; - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* Account time to the idle process. */ - account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); -} -#endif - -/* Update per-cpu stolen time. */ -static void vmi_account_stolen_cycles(int cpu, - unsigned long long cur_real_cycles, - unsigned long long cur_avail_cycles) -{ - long long stolen_cycles_not_accounted; - unsigned long stolen_jiffies = 0; - - if (cur_real_cycles < cur_avail_cycles) - return; - - stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - - per_cpu(stolen_cycles_accounted_cpu, cpu); - - while (stolen_cycles_not_accounted >= cycles_per_jiffy) { - stolen_jiffies++; - stolen_cycles_not_accounted -= cycles_per_jiffy; - per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* HACK: pass NULL to force time onto cpustat->steal. */ - account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); -} - -/* Body of either IRQ0 interrupt handler (UP no local-APIC) or - * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ -static void vmi_local_timer_interrupt(int cpu) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu process times. */ - vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); -} - -#ifdef CONFIG_NO_IDLE_HZ - -/* Must be called only from idle loop, with interrupts disabled. */ -int vmi_stop_hz_timer(void) -{ - /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ - - unsigned long seq, next; - unsigned long long real_cycles_expiry; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - if (sysctl_hz_timer != 0) - return 0; - - cpu_set(cpu, nohz_cpu_mask); - smp_mb(); - - if (rcu_needs_cpu(cpu) || local_softirq_pending() || - (next = next_timer_interrupt(), - time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) { - cpu_clear(cpu, nohz_cpu_mask); - return 0; - } - - /* Convert jiffies to the real cycle counter. */ - do { - seq = read_seqbegin(&xtime_lock); - real_cycles_expiry = real_cycles_accounted_system + - (long)(next - jiffies) * cycles_per_jiffy; - } while (read_seqretry(&xtime_lock, seq)); - - /* This cpu is going idle. Disable the periodic alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - per_cpu(idle_start_jiffies, cpu) = jiffies; - /* Set the real time alarm to expire at the next event. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, - real_cycles_expiry, 0); - return 1; -} - -static void vmi_reenable_hz_timer(int cpu) -{ - /* For /proc/vmi/info idle_hz stat. 
*/ - per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); - per_cpu(vmi_idle_no_hz_irqs, cpu)++; - - /* Don't bother explicitly cancelling the one-shot alarm -- at - * worse we will receive a spurious timer interrupt. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); - /* Indicate this cpu is no longer nohz idle. */ - cpu_clear(cpu, nohz_cpu_mask); -} - -/* Called from interrupt handlers when (local) HZ timer is disabled. */ -void vmi_account_time_restart_hz_timer(void) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - /* Account the time during which the HZ timer was disabled. */ - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu idle times. */ - vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); - /* Reenable the hz timer. */ - vmi_reenable_hz_timer(cpu); -} - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* UP (and no local-APIC) VMI-timer alarm interrupt handler. - * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after - * APIC setup and setup_boot_vmi_alarm() is called. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - vmi_local_timer_interrupt(smp_processor_id()); - return IRQ_HANDLED; -} - -#ifdef CONFIG_X86_LOCAL_APIC - -/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. - * Also used in UP when CONFIG_X86_LOCAL_APIC. - * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ -void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat,cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - vmi_local_timer_interrupt(cpu); - irq_exit(); - set_irq_regs(old_regs); -} - -#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6f38f81..23e8614 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; -_proxy_pda = 1; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { @@ -61,8 +60,6 @@ SECTIONS __stop___ex_table = .; } - RODATA - BUG_TABLE . = ALIGN(4); @@ -72,6 +69,8 @@ SECTIONS __tracedata_end = .; } + RODATA + /* writeable */ . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ @@ -117,22 +116,11 @@ SECTIONS /* might get freed after init */ . 
= ALIGN(4096); - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - __smp_alt_begin = .; - __smp_alt_instructions = .; - *(.smp_altinstructions) - __smp_alt_instructions_end = .; - } - . = ALIGN(4); .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) __smp_locks_end = .; } - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - __smp_alt_end = .; - } /* will be freed after init * Following ALIGN() is required to make sure no other data falls on the * same page where __smp_alt_end is pointing as that page might be freed @@ -178,9 +166,9 @@ SECTIONS } . = ALIGN(4); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __start_parainstructions = .; + __parainstructions = .; *(.parainstructions) - __stop_parainstructions = .; + __parainstructions_end = .; } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ @@ -194,7 +182,7 @@ SECTIONS __initramfs_end = .; } #endif - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(4096); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index f66cd11..4a8b0ed 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,7 +7,7 @@ SECTIONS { - . = VDSO_PRELINK + SIZEOF_HEADERS; + . = VDSO_PRELINK_asm + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -21,7 +21,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + 0x400; + . = VDSO_PRELINK_asm + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note |
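
The EFLAGS probe at the top of the new verify_cpu.S is the classic test for CPUID support: bit 21 ("ID") of EFLAGS can only be toggled if the CPUID instruction exists. Below is a minimal 32-bit C sketch of the same probe; it is illustrative only, not part of the patch, and the helper name have_cpuid() is invented (build with gcc -m32).

/* Sketch: detect CPUID availability by toggling EFLAGS bit 21,
 * mirroring the check in verify_cpu.S.  i386 only. */
#include <stdio.h>

static int have_cpuid(void)
{
	unsigned long f1, f2;

	asm volatile("pushfl\n\t"             /* save original EFLAGS      */
		     "pushfl\n\t"
		     "popl %0\n\t"            /* f1 = current EFLAGS       */
		     "movl %0, %1\n\t"        /* keep an unmodified copy   */
		     "xorl $0x200000, %0\n\t" /* flip the ID bit (bit 21)  */
		     "pushl %0\n\t"
		     "popfl\n\t"              /* try to write it back      */
		     "pushfl\n\t"
		     "popl %0\n\t"            /* re-read EFLAGS            */
		     "popfl"                  /* restore original EFLAGS   */
		     : "=&r" (f1), "=&r" (f2)
		     :
		     : "cc");

	return (f1 ^ f2) & 0x200000;          /* non-zero => CPUID exists  */
}

int main(void)
{
	printf("cpuid %s\n", have_cpuid() ? "supported" : "missing");
	return 0;
}

verify_cpu.S performs the same dance in assembly because it runs before any C environment is set up, so that the BIOS can still be used to report an unsupported CPU.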
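
vmi_time_init_clockevent() in the new vmiclock.c derives the clockevent mult/shift pair with div_sc(cycles_per_msec, NSEC_PER_MSEC, 22); the clockevent core then scales a nanosecond delta back into cycles as (ns * mult) >> shift. The following standalone sketch of that fixed-point round trip is not from the patch and uses an invented 1 GHz counter rate; div_sc() here is a simplified stand-in for the kernel helper, which computes essentially the same (rate << shift) / base.

/* Sketch: the mult/shift scaling used by the vmi clockevent setup. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static uint32_t div_sc(uint64_t rate, uint64_t base, int shift)
{
	return (uint32_t)((rate << shift) / base);   /* must fit in 32 bits */
}

int main(void)
{
	uint64_t cycles_per_msec = 1000000;   /* pretend 1 GHz cycle counter */
	int shift = 22;                       /* same shift the patch picks  */
	uint32_t mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, shift);

	/* Convert a 1 ms (1,000,000 ns) delta back into counter cycles. */
	uint64_t delta_ns = 1000000;
	uint64_t cycles = (delta_ns * mult) >> shift;

	printf("mult=%u, 1ms => %llu cycles\n",
	       mult, (unsigned long long)cycles);
	return 0;
}

The clocksource side (clocksource_khz2mult() for clocksource_vmi) performs the converse conversion, cycles to nanoseconds, which is what the comment in init_vmi_clocksource() means by converting "in the opposite direction" from clockevents.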
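
The deleted vmitime.c accounted elapsed time by keeping a balance of unaccounted cycles and draining it one cycles_per_jiffy at a time; vmi_account_real_cycles(), vmi_account_process_times_cycles(), and the stolen-time variant all follow that pattern. A self-contained sketch of the accumulation loop, with invented names and an invented 1 GHz rate, purely to show the shape of the removed code:

/* Sketch: cycles-to-jiffies accumulation as done by the removed
 * vmi_account_*_cycles() helpers.  Names and rates are examples only. */
#include <stdint.h>
#include <stdio.h>

#define HZ 250

static uint64_t cycles_per_jiffy;
static uint64_t cycles_accounted;     /* running total already credited */

/* Credit whole jiffies that have elapsed up to current_cycles and
 * return how many were credited. */
static unsigned int account_cycles(uint64_t current_cycles)
{
	uint64_t not_accounted = current_cycles - cycles_accounted;
	unsigned int jiffies_credited = 0;

	while (not_accounted >= cycles_per_jiffy) {
		jiffies_credited++;
		not_accounted -= cycles_per_jiffy;
		cycles_accounted += cycles_per_jiffy;
	}
	return jiffies_credited;
}

int main(void)
{
	uint64_t freq = 1000000000ULL;    /* pretend 1 GHz counter */

	cycles_per_jiffy = freq / HZ;
	cycles_accounted = 0;

	/* 10 ms worth of cycles: HZ/100 = 2.5, so 2 whole jiffies. */
	printf("credited %u jiffies\n", account_cycles(freq / 100));
	return 0;
}

The replacement vmiclock.c drops this manual bookkeeping in favour of the generic clockevent and clocksource frameworks, which is why the whole file could be removed.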