From 725d704ecaca4a43f067092c140d4f3271cf2856 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:30:55 -0700
Subject: [PATCH] mm: VM_BUG_ON

Introduce a VM_BUG_ON, which is turned on with CONFIG_DEBUG_VM.  Use this
in the lightweight, inline refcounting functions; PageLRU and PageActive
checks in vmscan, because they're pretty well confined to vmscan.  And in
page allocate/free fastpaths which can be the hottest parts of the kernel
for kbuilds.

Unlike BUG_ON, VM_BUG_ON must not be used to execute statements with
side-effects, and should not be used outside core mm code.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 224178a..7d20b25 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -278,6 +278,12 @@ struct page {
  */
 #include <linux/page-flags.h>
 
+#ifdef CONFIG_DEBUG_VM
+#define VM_BUG_ON(cond) BUG_ON(cond)
+#else
+#define VM_BUG_ON(condition) do { } while(0)
+#endif
+
 /*
  * Methods to modify the page usage count.
  *
@@ -297,7 +303,7 @@ struct page {
  */
 static inline int put_page_testzero(struct page *page)
 {
-	BUG_ON(atomic_read(&page->_count) == 0);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	return atomic_dec_and_test(&page->_count);
 }
 
@@ -307,6 +313,7 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
+	VM_BUG_ON(PageCompound(page));
 	return atomic_inc_not_zero(&page->_count);
 }
 
@@ -323,6 +330,7 @@ static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	atomic_inc(&page->_count);
 }
 
-- 
cgit v1.1


From d08b3851da41d0ee60851f2c75b118e1f7a5fc89 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:30:57 -0700
Subject: [PATCH] mm: tracking shared dirty pages

Tracking of dirty pages in shared writeable mmap()s.

The idea is simple: write protect clean shared writeable pages, catch the
write-fault, make writeable and set dirty.  On page write-back clean all the
PTE dirty bits and write protect them once again.

The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained.  Hence it is not
enough to test the backing_dev_info for cap_account_dirty.

The current heuristic is as follows, a VMA is eligible when:
 - its shared writeable
    (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
 - it is not a 'special' mapping
    (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
 - the backing_dev_info is cap_account_dirty
    mapping_cap_account_dirty(vma->vm_file->f_mapping)
 - f_op->mmap() didn't change the default page protection

Page from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and
because they don't have a backing store anyway.

mprotect() is taught about the new behaviour as well.  However it overrides
the last condition.

Cleaning the pages on write-back is done with page_mkclean() a new rmap call.
It can be called on any page, but is currently only implemented for mapped
pages, if the page is found the be of a VMA that accounts dirty pages it will
also wrprotect the PTE.

Finally, in fs/buffers.c:try_to_free_buffers(); remove clear_page_dirty() from
under ->private_lock.  This seems to be safe, since ->private_lock is used to
serialize access to the buffers, not the page itself.  This is needed because
clear_page_dirty() will call into page_mkclean() and would thereby violate
locking order.

[dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d20b25..4498414 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
+#include <linux/backing-dev.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -810,6 +811,39 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
+/*
+ * Some shared mappigns will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	unsigned int vm_flags = vma->vm_flags;
+
+	/* If it was private or non-writable, the write bit is already clear */
+	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+		return 0;
+
+	/* The backer wishes to know when pages are first written to? */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The open routine did something to the protections already? */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(protection_map[vm_flags &
+		    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+		return 0;
+
+	/* Specialty mapping? */
+	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Can the mapping track the dirty pages? */
+	return vma->vm_file && vma->vm_file->f_mapping &&
+		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
 extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
 
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
-- 
cgit v1.1


From b221385bc41d6789edde3d2fa0cb20d5045730eb Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 25 Sep 2006 23:31:02 -0700
Subject: [PATCH] mm/: make functions static

This patch makes the following needlessly global functions static:
 - slab.c: kmem_find_general_cachep()
 - swap.c: __page_cache_release()
 - vmalloc.c: __vmalloc_node()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4498414..45678b0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -318,8 +318,6 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
-extern void FASTCALL(__page_cache_release(struct page *));
-
 static inline int page_count(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-- 
cgit v1.1


From 2f1b6248682f8b39ca3c7e549dfc216d26c4109b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:13 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: use enum to define zones, reformat and
 comment

Use enum for zones and reformat zones dependent information

Add comments explaning the use of zones and add a zones_t type for zone
numbers.

Line up information that will be #ifdefd by the following patches.

[akpm@osdl.org: comment cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 45678b0..2db4229 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -470,7 +470,7 @@ void split_page(struct page *page, unsigned int order);
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
-static inline unsigned long page_zonenum(struct page *page)
+static inline enum zone_type page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
@@ -499,11 +499,12 @@ static inline unsigned long page_to_section(struct page *page)
 	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long zone)
+static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
 	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
 }
+
 static inline void set_page_node(struct page *page, unsigned long node)
 {
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
@@ -515,7 +516,7 @@ static inline void set_page_section(struct page *page, unsigned long section)
 	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline void set_page_links(struct page *page, unsigned long zone,
+static inline void set_page_links(struct page *page, enum zone_type zone,
 	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
-- 
cgit v1.1


From da6052f7b33abe55fbfd7d2213815f58c00a88d4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:35 -0700
Subject: [PATCH] update some mm/ comments

Let's try to keep mm/ comments more useful and up to date. This is a start.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 68 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2db4229..f201877 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -219,7 +219,8 @@ struct inode;
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
- * a page.
+ * a page, though if it is a pagecache page, rmap structures can tell us
+ * who is mapping it.
  */
 struct page {
 	unsigned long flags;		/* Atomic flags, some possibly
@@ -299,8 +300,7 @@ struct page {
  */
 
 /*
- * Drop a ref, return true if the logical refcount fell to zero (the page has
- * no users)
+ * Drop a ref, return true if the refcount fell to zero (the page has no users)
  */
 static inline int put_page_testzero(struct page *page)
 {
@@ -356,43 +356,55 @@ void split_page(struct page *page, unsigned int order);
  * For the non-reserved pages, page_count(page) denotes a reference count.
  *   page_count() == 0 means the page is free. page->lru is then used for
  *   freelist management in the buddy allocator.
- *   page_count() == 1 means the page is used for exactly one purpose
- *   (e.g. a private data page of one process).
+ *   page_count() > 0  means the page has been allocated.
  *
- * A page may be used for kmalloc() or anyone else who does a
- * __get_free_page(). In this case the page_count() is at least 1, and
- * all other fields are unused but should be 0 or NULL. The
- * management of this page is the responsibility of the one who uses
- * it.
+ * Pages are allocated by the slab allocator in order to provide memory
+ * to kmalloc and kmem_cache_alloc. In this case, the management of the
+ * page, and the fields in 'struct page' are the responsibility of mm/slab.c
+ * unless a particular usage is carefully commented. (the responsibility of
+ * freeing the kmalloc memory is the caller's, of course).
  *
- * The other pages (we may call them "process pages") are completely
+ * A page may be used by anyone else who does a __get_free_page().
+ * In this case, page_count still tracks the references, and should only
+ * be used through the normal accessor functions. The top bits of page->flags
+ * and page->virtual store page management information, but all other fields
+ * are unused and could be used privately, carefully. The management of this
+ * page is the responsibility of the one who allocated it, and those who have
+ * subsequently been given references to it.
+ *
+ * The other pages (we may call them "pagecache pages") are completely
  * managed by the Linux memory manager: I/O, buffers, swapping etc.
  * The following discussion applies only to them.
  *
- * A page may belong to an inode's memory mapping. In this case,
- * page->mapping is the pointer to the inode, and page->index is the
- * file offset of the page, in units of PAGE_CACHE_SIZE.
+ * A pagecache page contains an opaque `private' member, which belongs to the
+ * page's address_space. Usually, this is the address of a circular list of
+ * the page's disk buffers. PG_private must be set to tell the VM to call
+ * into the filesystem to release these pages.
  *
- * A page contains an opaque `private' member, which belongs to the
- * page's address_space.  Usually, this is the address of a circular
- * list of the page's disk buffers.
+ * A page may belong to an inode's memory mapping. In this case, page->mapping
+ * is the pointer to the inode, and page->index is the file offset of the page,
+ * in units of PAGE_CACHE_SIZE.
  *
- * For pages belonging to inodes, the page_count() is the number of
- * attaches, plus 1 if `private' contains something, plus one for
- * the page cache itself.
+ * If pagecache pages are not associated with an inode, they are said to be
+ * anonymous pages. These may become associated with the swapcache, and in that
+ * case PG_swapcache is set, and page->private is an offset into the swapcache.
  *
- * Instead of keeping dirty/clean pages in per address-space lists, we instead
- * now tag pages as dirty/under writeback in the radix tree.
+ * In either case (swapcache or inode backed), the pagecache itself holds one
+ * reference to the page. Setting PG_private should also increment the
+ * refcount. The each user mapping also has a reference to the page.
  *
- * There is also a per-mapping radix tree mapping index to the page
- * in memory if present. The tree is rooted at mapping->root.  
+ * The pagecache pages are stored in a per-mapping radix tree, which is
+ * rooted at mapping->page_tree, and indexed by offset.
+ * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
+ * lists, we instead now tag pages as dirty/writeback in the radix tree.
  *
- * All process pages can do I/O:
+ * All pagecache pages may be subject to I/O:
  * - inode pages may need to be read from disk,
  * - inode pages which have been modified and are MAP_SHARED may need
- *   to be written to disk,
- * - private pages which have been modified may need to be swapped out
- *   to swap space and (later) to be read back into memory.
+ *   to be written back to the inode on disk,
+ * - anonymous pages (including MAP_PRIVATE file mappings) which have been
+ *   modified may need to be swapped out to swap space and (later) to be read
+ *   back into memory.
  */
 
 /*
-- 
cgit v1.1


From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:55 -0700
Subject: [PATCH] NUMA: Add zone_to_nid function

There are many places where we need to determine the node of a zone.
Currently we use a difficult to read sequence of pointer dereferencing.
Put that into an inline function and use throughout VM.  Maybe we can find
a way to optimize the lookup in the future.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f201877..856f0ee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,12 +499,17 @@ static inline struct zone *page_zone(struct page *page)
 	return zone_table[page_zone_id(page)];
 }
 
+static inline unsigned long zone_to_nid(struct zone *zone)
+{
+	return zone->zone_pgdat->node_id;
+}
+
 static inline unsigned long page_to_nid(struct page *page)
 {
 	if (FLAGS_HAS_NODE)
 		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 	else
-		return page_zone(page)->zone_pgdat->node_id;
+		return zone_to_nid(page_zone(page));
 }
 static inline unsigned long page_to_section(struct page *page)
 {
-- 
cgit v1.1


From c713216deebd95d2b0ab38fef8bb2361c0180c2d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 27 Sep 2006 01:49:43 -0700
Subject: [PATCH] Introduce mechanism for registering active regions of memory

At a basic level, architectures define structures to record where active
ranges of page frames are located.  Once located, the code to calculate zone
sizes and holes in each architecture is very similar.  Some of this zone and
hole sizing code is difficult to read for no good reason.  This set of patches
eliminates the similar-looking architecture-specific code.

The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range().  When all areas have
been discovered, free_area_init_nodes() is called to initialise the pgdat and
zones.  The zone sizes and holes are then calculated in an architecture
independent manner.

Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism - 74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism - 52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not reclaimable.
	It adjusts the watermarks slightly

Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig.  Bob Picco has also tested and debugged on
IA64.  Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine.  These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.

There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory holes
but the architecture-independent code accounts the memory as present.

The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy.  There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.

Additional credit;
	Dave Hansen for the initial suggestion and comments on early patches
	Andy Whitcroft for reviewing early versions and catching numerous
		errors
	Tony Luck for testing and debugging on IA64
	Bob Picco for fixing bugs related to pfn registration, reviewing a
		number of patch revisions, providing a number of suggestions
		on future direction and testing heavily
	Jack Steiner and Robin Holt for testing on IA64 and clarifying
		issues related to memory holes
	Yasunori for testing on IA64
	Andi Kleen for reviewing and feeding back about x86_64
	Christian Kujau for providing valuable information related to ACPI
		problems on x86_64 and testing potential fixes

This patch:

Define the structure to represent an active range of page frames within a node
in an architecture independent manner.  Architectures are expected to register
active ranges of PFNs using add_active_range(nid, start_pfn, end_pfn) and call
free_area_init_nodes() passing the PFNs of the end of each zone.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 856f0ee..c0402da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,53 @@ extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in a more
+ * architecture independent manner. This is a substitute for creating the
+ * zone_sizes[] and zholes_size[] arrays and passing them to
+ * free_area_init_node()
+ *
+ * An architecture is expected to register range of page frames backed by
+ * physical memory with add_active_range() before calling
+ * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
+ * usage, an architecture is expected to do something like
+ *
+ * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
+ * 							 max_highmem_pfn};
+ * for_each_valid_physical_page_range()
+ * 	add_active_range(node_id, start_pfn, end_pfn)
+ * free_area_init_nodes(max_zone_pfns);
+ *
+ * If the architecture guarantees that there are no holes in the ranges
+ * registered with add_active_range(), free_bootmem_active_regions()
+ * will call free_bootmem_node() for each registered physical page range.
+ * Similarly sparse_memory_present_with_active_regions() calls
+ * memory_present() for each range when SPARSEMEM is enabled.
+ *
+ * See mm/page_alloc.c for more information on each function exposed by
+ * CONFIG_ARCH_POPULATES_NODE_MAP
+ */
+extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+						unsigned long new_end_pfn);
+extern void remove_all_active_ranges(void);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+						unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern void free_bootmem_with_active_regions(int nid,
+						unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+extern int early_pfn_to_nid(unsigned long pfn);
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
-- 
cgit v1.1


From 0e0b864e069c52a7b3e4a7da56e29b03a012fd75 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 27 Sep 2006 01:49:56 -0700
Subject: [PATCH] Account for memmap and optionally the kernel image as holes

The x86_64 code accounted for memmap and some portions of the the DMA zone as
holes.  This was because those areas would never be reclaimed and accounting
for them as memory affects min watermarks.  This patch will account for the
memmap as a memory hole.  Architectures may optionally use set_dma_reserve()
if they wish to account for a portion of memory in ZONE_DMA as a hole.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c0402da..22936e1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -984,6 +984,7 @@ extern void sparse_memory_present_with_active_regions(int nid);
 extern int early_pfn_to_nid(unsigned long pfn);
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
-- 
cgit v1.1


From fb01439c5b778d5974a488c5d4fe85e6d0e18a68 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@skynet.ie>
Date: Wed, 27 Sep 2006 01:49:59 -0700
Subject: [PATCH] Allow an arch to expand node boundaries

Arch-independent zone-sizing determines the size of a node
(pgdat->node_spanned_pages) based on the physical memory that was
registered by the architecture.  However, when
CONFIG_MEMORY_HOTPLUG_RESERVE is set, the architecture expects that the
spanned_pages will be much larger and that mem_map will be allocated that
is used lated on memory hot-add.

This patch allows an architecture that sets CONFIG_MEMORY_HOTPLUG_RESERVE
to call push_node_boundaries() which will set the node beginning and end to
at *least* the requested boundary.

Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22936e1..9d046db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -970,6 +970,8 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
 						unsigned long new_end_pfn);
+extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 						unsigned long end_pfn);
-- 
cgit v1.1


From 5b99cd0effaf846240a15441aec459a592577eaf Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 27 Sep 2006 01:50:01 -0700
Subject: [PATCH] own header file for struct page

This moves the definition of struct page from mm.h to its own header file
page-struct.h.  This is a prereq to fix SetPageUptodate which is broken on
s390:

#define SetPageUptodate(_page)
       do {
               struct page *__page = (_page);
               if (!test_and_set_bit(PG_uptodate, &__page->flags))
                       page_test_and_clear_dirty(_page);
       } while (0)

_page gets used twice in this macro which can cause subtle bugs.  Using
__page for the page_test_and_clear_dirty call doesn't work since it causes
yet another problem with the page_test_and_clear_dirty macro as well.

In order to avoid all these problems caused by macros it seems to be a good
idea to get rid of them and convert them to static inline functions.
Because of header file include order it's necessary to have a seperate
header file for the struct page definition.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 62 +-----------------------------------------------------
 1 file changed, 1 insertion(+), 61 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9d046db..7477fb5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -16,6 +16,7 @@
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
 #include <linux/backing-dev.h>
+#include <linux/mm_types.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -215,62 +216,6 @@ struct vm_operations_struct {
 struct mmu_gather;
 struct inode;
 
-/*
- * Each physical page in the system has a struct page associated with
- * it to keep track of whatever it is we are using the page for at the
- * moment. Note that we have no way to track which tasks are using
- * a page, though if it is a pagecache page, rmap structures can tell us
- * who is mapping it.
- */
-struct page {
-	unsigned long flags;		/* Atomic flags, some possibly
-					 * updated asynchronously */
-	atomic_t _count;		/* Usage count, see below. */
-	atomic_t _mapcount;		/* Count of ptes mapped in mms,
-					 * to show when page is mapped
-					 * & limit reverse map searches.
-					 */
-	union {
-	    struct {
-		unsigned long private;		/* Mapping-private opaque data:
-					 	 * usually used for buffer_heads
-						 * if PagePrivate set; used for
-						 * swp_entry_t if PageSwapCache;
-						 * indicates order in the buddy
-						 * system if PG_buddy is set.
-						 */
-		struct address_space *mapping;	/* If low bit clear, points to
-						 * inode address_space, or NULL.
-						 * If page mapped as anonymous
-						 * memory, low bit is set, and
-						 * it points to anon_vma object:
-						 * see PAGE_MAPPING_ANON below.
-						 */
-	    };
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
-	    spinlock_t ptl;
-#endif
-	};
-	pgoff_t index;			/* Our offset within mapping. */
-	struct list_head lru;		/* Pageout list, eg. active_list
-					 * protected by zone->lru_lock !
-					 */
-	/*
-	 * On machines where all RAM is mapped into kernel address space,
-	 * we can simply calculate the virtual address. On machines with
-	 * highmem some memory is mapped into kernel virtual memory
-	 * dynamically, so we need a place to store that address.
-	 * Note that this field could be 16 bits on x86 ... ;)
-	 *
-	 * Architectures with slow multiplication can define
-	 * WANT_PAGE_VIRTUAL in asm/page.h
-	 */
-#if defined(WANT_PAGE_VIRTUAL)
-	void *virtual;			/* Kernel virtual address (NULL if
-					   not kmapped, ie. highmem) */
-#endif /* WANT_PAGE_VIRTUAL */
-};
-
 #define page_private(page)		((page)->private)
 #define set_page_private(page, v)	((page)->private = (v))
 
@@ -546,11 +491,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  */
 #include <linux/vmstat.h>
 
-#ifndef CONFIG_DISCONTIGMEM
-/* The array of struct pages - for discontigmem use pgdat->lmem_map */
-extern struct page *mem_map;
-#endif
-
 static __always_inline void *lowmem_page_address(struct page *page)
 {
 	return __va(page_to_pfn(page) << PAGE_SHIFT);
-- 
cgit v1.1


From d5f541ed6e31518508c688912e7464facf253c87 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 27 Sep 2006 01:50:08 -0700
Subject: [PATCH] Add node to zone for the NUMA case

Add the node in order to optimize zone_to_nid.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7477fb5..8e433bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -446,7 +446,11 @@ static inline struct zone *page_zone(struct page *page)
 
 static inline unsigned long zone_to_nid(struct zone *zone)
 {
-	return zone->zone_pgdat->node_id;
+#ifdef CONFIG_NUMA
+	return zone->node;
+#else
+	return 0;
+#endif
 }
 
 static inline unsigned long page_to_nid(struct page *page)
-- 
cgit v1.1


From f4b81804a2d1ab341a4613089dc31ecce0800ed8 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 27 Sep 2006 01:50:10 -0700
Subject: [PATCH] do_no_pfn()

Implement do_no_pfn() for handling mapping of memory without a struct page
backing it.  This avoids creating fake page table entries for regions which
are not backed by real memory.

This feature is used by the MSPEC driver and other users, where it is
highly undesirable to have a struct page sitting behind the page (for
instance if the page is accessed in cached mode via the struct page in
parallel to the the driver accessing it uncached, which can result in data
corruption on some architectures, such as ia64).

This version uses specific NOPFN_{SIGBUS,OOM} return values, rather than
expect all negative pfn values would be an error.  It also bugs on cow
mappings as this would not work with the VM.

[akpm@osdl.org: micro-optimise]
Signed-off-by: Jes Sorensen <jes@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8e433bb..22165cb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
+	unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
 
 	/* notification that a previously read-only page is about to become
@@ -594,6 +595,12 @@ static inline int page_mapped(struct page *page)
 #define NOPAGE_OOM	((struct page *) (-1))
 
 /*
+ * Error return values for the *_nopfn functions
+ */
+#define NOPFN_SIGBUS	((unsigned long) -1)
+#define NOPFN_OOM	((unsigned long) -2)
+
+/*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
-- 
cgit v1.1


From f269fdd1829acc5e53bf57b145003e5733133f2b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Sep 2006 01:50:23 -0700
Subject: [PATCH] NOMMU: move the fallback arch_vma_name() to a sensible place

Move the fallback arch_vma_name() to a sensible place (kernel/signal.c).

Currently it's in fs/proc/task_mmu.c, a file that is dependent on both
CONFIG_PROC_FS and CONFIG_MMU being enabled, but it's used from
kernel/signal.c from where it is called unconditionally.

[akpm@osdl.org: build fix]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22165cb..7b703b6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1131,7 +1131,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
-const char *arch_vma_name(struct vm_area_struct *vma);
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
-- 
cgit v1.1