Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/fault.c          |  46
-rw-r--r--  arch/powerpc/mm/fsl_booke_mmu.c  |  95
-rw-r--r--  arch/powerpc/mm/mem.c            |  33
-rw-r--r--  arch/powerpc/mm/mmap.c           |   6
-rw-r--r--  arch/powerpc/mm/numa.c           |  40
-rw-r--r--  arch/powerpc/mm/pgtable.c        | 131
6 files changed, 201 insertions(+), 150 deletions(-)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 91c7b86..7699394 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -253,45 +253,33 @@ good_area:
#endif /* CONFIG_8xx */
if (is_exec) {
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
- /* protection fault */
+#ifdef CONFIG_PPC_STD_MMU
+ /* Protection faults on exec go straight to failure on
+ * Hash based MMUs as they either don't support per-page
+ * execute permission, or if they do, it's handled already
+ * at the hash level. This test would probably have to
+ * be removed if we change the way this works to make hash
+ * processors use the same I/D cache coherency mechanism
+ * as embedded.
+ */
if (error_code & DSISR_PROTFAULT)
goto bad_area;
+#endif /* CONFIG_PPC_STD_MMU */
+
/*
* Allow execution from readable areas if the MMU does not
* provide separate controls over reading and executing.
+ *
+ * Note: This code was previously not enabled for 4xx/BookE.
+ * It is now, as I/D cache coherency for these is done at
+ * set_pte_at() time and I see no reason why the test
+ * below wouldn't be valid on those processors. This -may-
+ * break programs compiled with a really old ABI though.
*/
if (!(vma->vm_flags & VM_EXEC) &&
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
!(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
-#else
- pte_t *ptep;
- pmd_t *pmdp;
-
- /* Since 4xx/Book-E supports per-page execute permission,
- * we lazily flush dcache to icache. */
- ptep = NULL;
- if (get_pteptr(mm, address, &ptep, &pmdp)) {
- spinlock_t *ptl = pte_lockptr(mm, pmdp);
- spin_lock(ptl);
- if (pte_present(*ptep)) {
- struct page *page = pte_page(*ptep);
-
- if (!test_bit(PG_arch_1, &page->flags)) {
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- }
- pte_update(ptep, 0, _PAGE_HWEXEC |
- _PAGE_ACCESSED);
- local_flush_tlb_page(vma, address);
- pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return 0;
- }
- pte_unmap_unlock(ptep, ptl);
- }
-#endif
/* a write */
} else if (is_write) {
if (!(vma->vm_flags & VM_WRITE))
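
The new exec-fault policy above reduces to two checks: hash MMUs bail out immediately on a protection fault, and every MMU type still tolerates execution from a merely readable mapping unless the CPU enforces per-page no-execute. A minimal stand-alone sketch of that decision follows; the flag values and parameter names are illustrative assumptions, not the kernel's definitions.

/* Stand-alone sketch of the exec-fault decision above; flag values and
 * parameter names are illustrative assumptions, not kernel definitions. */
#include <stdbool.h>
#include <stdio.h>

#define VM_READ  0x1
#define VM_WRITE 0x2
#define VM_EXEC  0x4

static bool exec_fault_is_bad_area(unsigned long vm_flags, bool hash_mmu,
				   bool prot_fault, bool cpu_noexec)
{
	/* Hash MMUs: exec protection is handled (or unsupported) at hash level */
	if (hash_mmu && prot_fault)
		return true;
	/* No VM_EXEC: only tolerated on CPUs without per-page no-execute,
	 * and only if the mapping is at least readable or writable */
	if (!(vm_flags & VM_EXEC) &&
	    (cpu_noexec || !(vm_flags & (VM_READ | VM_WRITE))))
		return true;
	return false;
}

int main(void)
{
	/* read-only mapping, CPU without no-execute: old ABIs keep working */
	printf("%d\n", exec_fault_is_bad_area(VM_READ, false, false, false)); /* 0 */
	/* same mapping once the CPU enforces no-execute: fault */
	printf("%d\n", exec_fault_is_bad_area(VM_READ, false, false, true));  /* 1 */
	return 0;
}
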
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index ea6e41e..985b6c3 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -56,10 +56,14 @@
extern void loadcam_entry(unsigned int index);
unsigned int tlbcam_index;
-static unsigned long __cam0, __cam1, __cam2;
+static unsigned long cam[CONFIG_LOWMEM_CAM_NUM];
#define NUM_TLBCAMS (16)
+#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS)
+#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
+#endif
+
struct tlbcam TLBCAM[NUM_TLBCAMS];
struct tlbcamrange {
@@ -107,7 +111,7 @@ void settlbcam(int index, unsigned long virt, phys_addr_t phys,
unsigned int tsize, lz;
asm ("cntlzw %0,%1" : "=r" (lz) : "r" (size));
- tsize = (21 - lz) / 2;
+ tsize = 21 - lz;
#ifdef CONFIG_SMP
if ((flags & _PAGE_NO_CACHE) == 0)
@@ -152,19 +156,19 @@ void invalidate_tlbcam_entry(int index)
loadcam_entry(index);
}
-void __init cam_mapin_ram(unsigned long cam0, unsigned long cam1,
- unsigned long cam2)
+unsigned long __init mmu_mapin_ram(void)
{
- settlbcam(0, PAGE_OFFSET, memstart_addr, cam0, _PAGE_KERNEL, 0);
- tlbcam_index++;
- if (cam1) {
- tlbcam_index++;
- settlbcam(1, PAGE_OFFSET+cam0, memstart_addr+cam0, cam1, _PAGE_KERNEL, 0);
- }
- if (cam2) {
+ unsigned long virt = PAGE_OFFSET;
+ phys_addr_t phys = memstart_addr;
+
+ while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) {
+ settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], _PAGE_KERNEL, 0);
+ virt += cam[tlbcam_index];
+ phys += cam[tlbcam_index];
tlbcam_index++;
- settlbcam(2, PAGE_OFFSET+cam0+cam1, memstart_addr+cam0+cam1, cam2, _PAGE_KERNEL, 0);
}
+
+ return virt - PAGE_OFFSET;
}
/*
@@ -175,51 +179,46 @@ void __init MMU_init_hw(void)
flush_instruction_cache();
}
-unsigned long __init mmu_mapin_ram(void)
-{
- cam_mapin_ram(__cam0, __cam1, __cam2);
-
- return __cam0 + __cam1 + __cam2;
-}
-
-
void __init
adjust_total_lowmem(void)
{
- phys_addr_t max_lowmem_size = __max_low_memory;
- phys_addr_t cam_max_size = 0x10000000;
phys_addr_t ram;
+ unsigned int max_cam = (mfspr(SPRN_TLB1CFG) >> 16) & 0xff;
+ char buf[ARRAY_SIZE(cam) * 5 + 1], *p = buf;
+ int i;
+ unsigned long virt = PAGE_OFFSET & 0xffffffffUL;
+ unsigned long phys = memstart_addr & 0xffffffffUL;
- /* adjust CAM size to max_lowmem_size */
- if (max_lowmem_size < cam_max_size)
- cam_max_size = max_lowmem_size;
+ /* TLB1CFG gives the max CAM size as (4^max) kB; convert to log2(bytes) */
+ max_cam = max_cam * 2 + 10;
- /* adjust lowmem size to max_lowmem_size */
- ram = min(max_lowmem_size, total_lowmem);
+ /* adjust lowmem size to __max_low_memory */
+ ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
/* Calculate CAM values */
- __cam0 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam0 > cam_max_size)
- __cam0 = cam_max_size;
- ram -= __cam0;
- if (ram) {
- __cam1 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam1 > cam_max_size)
- __cam1 = cam_max_size;
- ram -= __cam1;
- }
- if (ram) {
- __cam2 = 1UL << 2 * (__ilog2(ram) / 2);
- if (__cam2 > cam_max_size)
- __cam2 = cam_max_size;
- ram -= __cam2;
+ __max_low_memory = 0;
+ for (i = 0; ram && i < ARRAY_SIZE(cam); i++) {
+ unsigned int camsize = __ilog2(ram) & ~1U;
+ unsigned int align = __ffs(virt | phys) & ~1U;
+
+ if (camsize > align)
+ camsize = align;
+ if (camsize > max_cam)
+ camsize = max_cam;
+
+ cam[i] = 1UL << camsize;
+ ram -= cam[i];
+ __max_low_memory += cam[i];
+ virt += cam[i];
+ phys += cam[i];
+
+ p += sprintf(p, "%lu/", cam[i] >> 20);
}
+ for (; i < ARRAY_SIZE(cam); i++)
+ p += sprintf(p, "0/");
+ p[-1] = '\0';
- printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb,"
- " CAM2=%ldMb residual: %ldMb\n",
- __cam0 >> 20, __cam1 >> 20, __cam2 >> 20,
- (long int)((total_lowmem - __cam0 - __cam1 - __cam2)
- >> 20));
- __max_low_memory = __cam0 + __cam1 + __cam2;
+ pr_info("Memory CAM mapping: %s Mb, residual: %dMb\n", buf,
+ (unsigned int)((total_lowmem - __max_low_memory) >> 20));
__initial_memory_limit_addr = memstart_addr + __max_low_memory;
}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index f00f09a..f668fa9 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -472,40 +472,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
{
#ifdef CONFIG_PPC_STD_MMU
unsigned long access = 0, trap;
-#endif
- unsigned long pfn = pte_pfn(pte);
-
- /* handle i-cache coherency */
- if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
- !cpu_has_feature(CPU_FTR_NOEXECUTE) &&
- pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-#ifdef CONFIG_8xx
- /* On 8xx, cache control instructions (particularly
- * "dcbst" from flush_dcache_icache) fault as write
- * operation if there is an unpopulated TLB entry
- * for the address in question. To workaround that,
- * we invalidate the TLB here, thus avoiding dcbst
- * misbehaviour.
- */
- _tlbil_va(address, 0 /* 8xx doesn't care about PID */);
-#endif
- /* The _PAGE_USER test should really be _PAGE_EXEC, but
- * older glibc versions execute some code from no-exec
- * pages, which for now we are supporting. If exec-only
- * pages are ever implemented, this will have to change.
- */
- if (!PageReserved(page) && (pte_val(pte) & _PAGE_USER)
- && !test_bit(PG_arch_1, &page->flags)) {
- if (vma->vm_mm == current->active_mm) {
- __flush_dcache_icache((void *) address);
- } else
- flush_dcache_icache_page(page);
- set_bit(PG_arch_1, &page->flags);
- }
- }
-#ifdef CONFIG_PPC_STD_MMU
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(pte) || address >= TASK_SIZE)
return;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 86010fc..7db8abc 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -48,12 +48,6 @@ static inline unsigned long mmap_base(void)
static inline int mmap_is_legacy(void)
{
- /*
- * Force standard allocation for 64 bit programs.
- */
- if (!test_thread_flag(TIF_32BIT))
- return 1;
-
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 7393bd7..0507faa 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -19,6 +19,7 @@
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <linux/of.h>
+#include <linux/pfn.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
@@ -157,35 +158,6 @@ static void unmap_cpu_from_node(unsigned long cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
-{
- unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
- struct device_node *cpu_node = NULL;
- const unsigned int *interrupt_server, *reg;
- int len;
-
- while ((cpu_node = of_find_node_by_type(cpu_node, "cpu")) != NULL) {
- /* Try interrupt server first */
- interrupt_server = of_get_property(cpu_node,
- "ibm,ppc-interrupt-server#s", &len);
-
- len = len / sizeof(u32);
-
- if (interrupt_server && (len > 0)) {
- while (len--) {
- if (interrupt_server[len] == hw_cpuid)
- return cpu_node;
- }
- } else {
- reg = of_get_property(cpu_node, "reg", &len);
- if (reg && (len > 0) && (reg[0] == hw_cpuid))
- return cpu_node;
- }
- }
-
- return NULL;
-}
-
/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
@@ -289,7 +261,7 @@ static int __init find_min_common_depth(void)
ref_points = of_get_property(rtas_root,
"ibm,associativity-reference-points", &len);
- if ((len >= 1) && ref_points) {
+ if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
depth = ref_points[1];
} else {
dbg("NUMA: ibm,associativity-reference-points not found.\n");
@@ -469,7 +441,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
int nid = 0;
- struct device_node *cpu = find_cpu_node(lcpu);
+ struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
WARN_ON(1);
@@ -651,7 +623,7 @@ static int __init parse_numa_properties(void)
for_each_present_cpu(i) {
int nid;
- cpu = find_cpu_node(i);
+ cpu = of_get_cpu_node(i, NULL);
BUG_ON(!cpu);
nid = of_node_to_nid_single(cpu);
of_node_put(cpu);
@@ -882,7 +854,7 @@ static void mark_reserved_regions_for_nid(int nid)
unsigned long physbase = lmb.reserved.region[i].base;
unsigned long size = lmb.reserved.region[i].size;
unsigned long start_pfn = physbase >> PAGE_SHIFT;
- unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+ unsigned long end_pfn = PFN_UP(physbase + size);
struct node_active_region node_ar;
unsigned long node_end_pfn = node->node_start_pfn +
node->node_spanned_pages;
@@ -908,7 +880,7 @@ static void mark_reserved_regions_for_nid(int nid)
*/
if (end_pfn > node_ar.end_pfn)
reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
- - (start_pfn << PAGE_SHIFT);
+ - physbase;
/*
* Only worry about *this* node, others may not
* yet have valid NODE_DATA().
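
The PFN_UP() change above matters when a reserved LMB region ends in the middle of a page: rounding up keeps that last, partially covered page inside the reservation, and the reserve_size fix subtracts the exact physbase instead of a page-truncated start. A small sketch under an assumed 4 kB page size; PFN_UP/PFN_DOWN are re-declared locally rather than taken from <linux/pfn.h>.

/* Minimal sketch of the rounding fix, assuming 4 kB pages. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
	unsigned long physbase = 0x2000, size = 0x1800;	/* ends mid-page */

	/* old: the partially covered last page falls outside the reservation */
	printf("old end_pfn = %lu\n", PFN_DOWN(physbase + size));	/* 3 */
	/* new: round up so the reservation still covers it */
	printf("new end_pfn = %lu\n", PFN_UP(physbase + size));	/* 4 */
	return 0;
}
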
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 6d94116..a27ded3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -1,5 +1,6 @@
/*
* This file contains common routines for dealing with free of page tables
+ * along with common page table handling code.
*
* Derived from arch/powerpc/mm/tlb_64.c:
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -115,3 +116,133 @@ void pte_free_finish(void)
pte_free_submit(*batchp);
*batchp = NULL;
}
+
+/*
+ * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags()
+ */
+static pte_t do_dcache_icache_coherency(pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+ struct page *page;
+
+ if (unlikely(!pfn_valid(pfn)))
+ return pte;
+ page = pfn_to_page(pfn);
+
+ if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) {
+ pr_debug("do_dcache_icache_coherency... flushing\n");
+ flush_dcache_icache_page(page);
+ set_bit(PG_arch_1, &page->flags);
+ }
+ else
+ pr_debug("do_dcache_icache_coherency... already clean\n");
+ return __pte(pte_val(pte) | _PAGE_HWEXEC);
+}
+
+static inline int is_exec_fault(void)
+{
+ return current->thread.regs && TRAP(current->thread.regs) == 0x400;
+}
+
+/* We only try to do i/d cache coherency on stuff that looks like
+ * reasonably "normal" PTEs. We currently require a PTE to be present
+ * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE
+ */
+static inline int pte_looks_normal(pte_t pte)
+{
+ return (pte_val(pte) &
+ (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) ==
+ (_PAGE_PRESENT);
+}
+
+#if defined(CONFIG_PPC_STD_MMU)
+/* Server-style MMU handles coherency when hashing if HW exec permission
+ * is supported per page (currently 64-bit only). Otherwise, we always flush
+ * valid PTEs in set_pte.
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+ return set_pte && pte_looks_normal(pte) &&
+ !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
+ cpu_has_feature(CPU_FTR_NOEXECUTE));
+}
+#elif _PAGE_HWEXEC == 0
+/* Embedded type MMU without HW exec support (8xx only so far), we flush
+ * the cache for any present PTE
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+ return set_pte && pte_looks_normal(pte);
+}
+#else
+/* Other embedded CPUs with HW exec support per-page, we flush on exec
+ * fault if HWEXEC is not set
+ */
+static inline int pte_need_exec_flush(pte_t pte, int set_pte)
+{
+ return pte_looks_normal(pte) && is_exec_fault() &&
+ !(pte_val(pte) & _PAGE_HWEXEC);
+}
+#endif
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ */
+void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(pte_present(*ptep));
+#endif
+ /* Note: mm->context.id might not yet have been assigned as
+ * this context might not have been activated yet when this
+ * is called.
+ */
+ pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+ if (pte_need_exec_flush(pte, 1))
+ pte = do_dcache_icache_coherency(pte);
+
+ /* Perform the setting of the PTE */
+ __set_pte_at(mm, addr, ptep, pte, 0);
+}
+
+/*
+ * This is called when relaxing access to a PTE. It's also called in the page
+ * fault path when we don't hit any of the major fault cases, i.e., a minor
+ * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
+ * handled those two for us; we additionally deal with missing execute
+ * permission here on some processors.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, pte_t entry, int dirty)
+{
+ int changed;
+ if (!dirty && pte_need_exec_flush(entry, 0))
+ entry = do_dcache_icache_coherency(entry);
+ changed = !pte_same(*(ptep), entry);
+ if (changed) {
+ assert_pte_locked(vma->vm_mm, address);
+ __ptep_set_access_flags(ptep, entry);
+ flush_tlb_page_nohash(vma, address);
+ }
+ return changed;
+}
+
+#ifdef CONFIG_DEBUG_VM
+void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (mm == &init_mm)
+ return;
+ pgd = mm->pgd + pgd_index(addr);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, addr);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, addr);
+ BUG_ON(!pmd_present(*pmd));
+ BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd)));
+}
+#endif /* CONFIG_DEBUG_VM */
+
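
Taken together, the three pte_need_exec_flush() variants implement one policy: flush the data cache to the instruction cache (and, where it exists, grant _PAGE_HWEXEC) only when the hardware cannot keep the icache coherent by itself. A stand-alone model of that decision follows; the enum, parameter names and the main() scenarios are illustrative assumptions, not kernel definitions.

/* Stand-alone model of the pte_need_exec_flush() policy above. */
#include <stdbool.h>
#include <stdio.h>

enum mmu_kind {
	HASH_MMU,		/* CONFIG_PPC_STD_MMU */
	EMBEDDED_NO_HWEXEC,	/* _PAGE_HWEXEC == 0, e.g. 8xx */
	EMBEDDED_HWEXEC,	/* per-page HW exec, e.g. 4xx/BookE */
};

static bool need_exec_flush(enum mmu_kind mmu, bool set_pte, bool pte_normal,
			    bool hw_coherent, bool pte_hwexec, bool exec_fault)
{
	switch (mmu) {
	case HASH_MMU:		/* flush in set_pte unless COHERENT_ICACHE/NOEXECUTE */
		return set_pte && pte_normal && !hw_coherent;
	case EMBEDDED_NO_HWEXEC: /* flush every normal PTE we set */
		return set_pte && pte_normal;
	case EMBEDDED_HWEXEC:	/* flush lazily, on the first exec fault only */
		return pte_normal && exec_fault && !pte_hwexec;
	}
	return false;
}

int main(void)
{
	/* BookE: set_pte_at() leaves the page alone ... */
	printf("%d\n", need_exec_flush(EMBEDDED_HWEXEC, true, true, false, false, false)); /* 0 */
	/* ... and the first exec fault triggers the flush + HWEXEC grant */
	printf("%d\n", need_exec_flush(EMBEDDED_HWEXEC, false, true, false, false, true)); /* 1 */
	return 0;
}

In the new flow, the embedded exec fault falls through to ptep_set_access_flags(), which is where do_dcache_icache_coherency() runs and _PAGE_HWEXEC is set, replacing the manual fixup removed from fault.c above.
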