diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/mm/extable.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 446 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 49 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/iomap_32.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 27 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 217 | ||||
-rw-r--r-- | arch/x86/mm/pageattr.c | 49 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 43 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 1 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 296 |
12 files changed, 906 insertions, 242 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index d8cc96a..9f05157 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,6 +1,8 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o +obj-$(CONFIG_X86_SMP) += tlb.o + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 7e8db53..61b41ca 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { + /* If fixup is less than 16, it means uaccess error */ + if (fixup->fixup < 16) { + current_thread_info()->uaccess_err = -EFAULT; + regs->ip += fixup->fixup; + return 1; + } regs->ip = fixup->fixup; return 1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90dfae5..d3eee74 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -26,6 +26,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> +#include <linux/magic.h> #include <asm/system.h> #include <asm/desc.h> @@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { unsigned char *instr; int scan_more = 1; @@ -409,17 +410,16 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { unsigned long flags = oops_begin(); int sig = SIGKILL; - struct task_struct *tsk; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; @@ -429,6 +429,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + unsigned long *stackend; + +#ifdef CONFIG_X86_64 + unsigned long flags; + int sig; +#endif + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); +#endif +} + +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); +} + +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, error_code, address)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) + out_of_memory(regs, error_code, address); + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); +} + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) @@ -448,8 +638,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -494,7 +684,7 @@ static int spurious_fault(unsigned long address, * * This assumes no large pages in there. */ -static int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { #ifdef CONFIG_X86_32 unsigned long pgd_paddr; @@ -573,6 +763,25 @@ static int vmalloc_fault(unsigned long address) int show_unhandled_signals = 1; +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ + return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } + + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -583,16 +792,12 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long address; - int write, si_code; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif tsk = current; mm = tsk->mm; @@ -601,9 +806,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* get the address */ address = read_cr2(); - si_code = SEGV_MAPERR; - - if (notify_page_fault(regs)) + if (unlikely(notify_page_fault(regs))) return; if (unlikely(kmmio_fault(regs, address))) return; @@ -631,17 +834,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + if (spurious_fault(error_code, address)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ - goto bad_area_nosemaphore; + bad_area_nosemaphore(regs, error_code, address); + return; } - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -657,15 +860,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); + pgtable_bad(regs, error_code, address); #endif /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to @@ -683,20 +888,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. @@ -704,31 +915,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* @@ -738,11 +943,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; @@ -760,128 +962,6 @@ good_area: } #endif up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } DEFINE_SPINLOCK(pgd_lock); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 88f1b10..00263bf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,7 +49,6 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> -#include <asm/smp.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -138,6 +137,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. + */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end + && ((__pa(pte) >> PAGE_SHIFT) < table_start + || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + pte_t *newpte; + int i; + + BUG_ON(after_init_bootmem); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + /* * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in @@ -154,6 +194,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) unsigned long vaddr; pgd_t *pgd; pmd_t *pmd; + pte_t *pte = NULL; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -165,7 +206,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - one_page_table_init(pmd); + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte); vaddr += PMD_SIZE; } @@ -508,7 +550,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ - early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); @@ -801,7 +842,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse) tables += PAGE_ALIGN(ptes * sizeof(pte_t)); /* for fixmap */ - tables += PAGE_SIZE * 2; + tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); /* * RED-PEN putting page tables only on node 0 could diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 23f68e7..e6d36b4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -596,7 +596,7 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -static unsigned long __init kernel_physical_mapping_init(unsigned long start, +static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index d0151d8..ca53224 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -17,6 +17,7 @@ */ #include <asm/iomap.h> +#include <asm/pat.h> #include <linux/module.h> /* Map 'pfn' using fixed map 'type' and protections 'prot' @@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) pagefault_disable(); + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. + * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the + * MTRR is UC or WC. UC_MINUS gets the real intention, of the + * user, which is "WC if the MTRR is WC, UC if you can't do that." + */ + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte-idx, pfn_pte(pfn, prot)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bd85d42..1448bcb 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache); * * Must be freed with iounmap. */ -void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, @@ -557,34 +557,9 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_clear(void) -{ - pmd_t *pmd; - - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_clear()\n"); - - pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); - pmd_clear(pmd); - paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); - __flush_tlb_all(); -} - void __init early_ioremap_reset(void) { - enum fixed_addresses idx; - unsigned long addr, phys; - pte_t *pte; - after_paging_init = 1; - for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { - addr = fix_to_virt(idx); - pte = early_ioremap_pte(addr); - if (pte_present(*pte)) { - phys = pte_val(*pte) & PAGE_MASK; - set_fixmap(idx, phys); - } - } } static void __init __early_set_fixmap(enum fixed_addresses idx, diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f8..08d140f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,6 +20,12 @@ #include <asm/acpi.h> #include <asm/k8.h> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -33,6 +39,21 @@ int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +DEFINE_PER_CPU(int, node_number) = 0; +EXPORT_PER_CPU_SYMBOL(node_number); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* + * Which logical CPUs are on which nodes + */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -640,3 +661,199 @@ void __init init_cpu_to_node(void) #endif +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + per_cpu(node_number, cpu) = node; +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + + +/* empty cpumask */ +static const cpumask_t cpu_mask_none; + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const cpumask_t *cpumask_of_node(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return (const cpumask_t *)&cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return &cpu_mask_none; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +/* + * Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "node_to_cpumask(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_mask_none; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e89d248..84ba748 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -534,6 +534,36 @@ out_unlock: return 0; } +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + /* + * Ignore all non primary paths. + */ + if (!primary) + return 0; + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; @@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) repeat: kpte = lookup_address(address, &level); if (!kpte) - return 0; + return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; - if (!pte_val(old_pte)) { - if (!primary) - return 0; - WARN(1, KERN_WARNING "CPA: called for zero pte. " - "vaddr = %lx cpa->vaddr = %lx\n", address, - *cpa->vaddr); - return -EINVAL; - } + if (!pte_val(old_pte)) + return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; @@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa) vaddr = *cpa->vaddr; if (!(within(vaddr, PAGE_OFFSET, - PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) -#ifdef CONFIG_X86_64 - || within(vaddr, PAGE_OFFSET + (1UL<<32), - PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) -#endif - )) { + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 430cb44..9127e31 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -342,11 +342,23 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, req_type & _PAGE_CACHE_MASK); } - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, new_type); - else if (is_range_ram < 0) - return -EINVAL; + if (new_type) + *new_type = actual_type; + + /* + * For legacy reasons, some parts of the physical address range in the + * legacy 1MB region is treated as non-RAM (even when listed as RAM in + * the e820 tables). So we will track the memory attributes of this + * legacy 1MB region using the linear memtype_list always. + */ + if (end >= ISA_END_ADDRESS) { + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, + new_type); + else if (is_range_ram < 0) + return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -356,9 +368,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, new->end = end; new->type = actual_type; - if (new_type) - *new_type = actual_type; - spin_lock(&memtype_lock); if (cached_entry && start >= cached_start) @@ -446,11 +455,19 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; - is_range_ram = pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) - return -EINVAL; + /* + * For legacy reasons, some parts of the physical address range in the + * legacy 1MB region is treated as non-RAM (even when listed as RAM in + * the e820 tables). So we will track the memory attributes of this + * legacy 1MB region using the linear memtype_list always. + */ + if (end >= ISA_END_ADDRESS) { + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; + } spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8..15df1ba 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -21,6 +21,7 @@ #include <asm/numa.h> #include <asm/e820.h> #include <asm/genapic.h> +#include <asm/uv/uv.h> int acpi_numa __initdata; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 0000000..72a6d4e --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,296 @@ +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/module.h> + +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + +#include <mach_ipi.h> +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right array slot for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + struct mm_struct *flush_mm; + unsigned long flush_va; + spinlock_t tlbstate_lock; + DECLARE_BITMAP(flush_cpumask, NR_CPUS); + }; + char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; +} ____cacheline_internodealigned_in_smp; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +/* + * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop + * but still used for documentation purpose but the usage is slightly + * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt + * entry calls in with the first parameter in %eax. Maybe define + * intrlinkage? + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned int cpu; + unsigned int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; + f = &flush_state[sender]; + + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); + smp_mb__after_clear_bit(); + inc_irq_stat(irq_tlb_count); +} + +static void flush_tlb_others_ipi(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + unsigned int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &flush_state[sender]; + + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpumask_andnot(to_cpumask(f->flush_cpumask), + cpumask, cpumask_of(smp_processor_id())); + + /* + * Make the above memory operations globally visible before + * sending the IPI. + */ + smp_mb(); + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + if (is_uv_system()) { + unsigned int cpu; + + cpu = get_cpu(); + cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + if (cpumask) + flush_tlb_others_ipi(cpumask, mm, va); + put_cpu(); + return; + } + flush_tlb_others_ipi(cpumask, mm, va); +} + +static int __cpuinit init_smp_flush(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(flush_state); i++) + spin_lock_init(&flush_state[i].tlbstate_lock); + + return 0; +} +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + local_flush_tlb(); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} |