From e2cb96b7ecba46888cf00252ffdb8ef1e92c4258 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 19 Aug 2008 08:51:22 -0500 Subject: slub: Disable NUMA remote node defragmentation by default Switch remote node defragmentation off by default. The current settings can cause excessive node local allocations with hackbench: SLAB: % cat /proc/meminfo MemTotal: 7701760 kB MemFree: 5940096 kB Slab: 123840 kB SLUB: % cat /proc/meminfo MemTotal: 7701376 kB MemFree: 4740928 kB Slab: 1591680 kB [Note: this feature is not related to slab defragmentation.] You can find the original discussion here: http://lkml.org/lkml/2008/8/4/308 Reported-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4f5b961..fb486d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2312,7 +2312,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -4058,7 +4058,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, if (err) return err; - if (ratio < 100) + if (ratio <= 100) s->remote_node_defrag_ratio = ratio * 10; return length; -- cgit v1.1 From 16f8c5b2e64dec7faa5d3c7e9bdf0765e864e481 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 20 Aug 2008 14:09:04 -0700 Subject: mm: page_remove_rmap comments on PageAnon Add a comment to s390's page_test_dirty/page_clear_dirty/page_set_dirty dance in page_remove_rmap(): I was wrong to think the PageSwapCache test could be avoided, and would like a comment in there to remind me. And mention s390, to help us remember that this block is not really common. Also move down the "It would be tidy to reset PageAnon" comment: it does not belong to s390's block, and it would be unwise to reset PageAnon before we're done with testing it. Signed-off-by: Hugh Dickins Acked-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 1ea4e6f..0597747 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -659,23 +659,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) } /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. */ if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { page_clear_dirty(page); set_page_dirty(page); } - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ } } -- cgit v1.1 From 07279cdfd964acc032de92a527cb11b1f40f35aa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 20 Aug 2008 14:09:05 -0700 Subject: mm: show free swap as signed Adjust m show_swap_cache_info() to show "Free swap" as a signed long: the signed format is preferable, because during swapoff nr_swap_pages can legitimately go negative, so makes more sense thus (it used to be shown redundantly, once as signed and once as unsigned). Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 167cf2d..797c383 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -60,7 +60,7 @@ void show_swap_cache_info(void) printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } -- cgit v1.1 From 759f9a2df78d2156a2675edc7999fb4c919a3159 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Wed, 20 Aug 2008 14:09:06 -0700 Subject: mm: mminit_loglevel cannot be __meminitdata anymore mminit_loglevel is now used from mminit_verify_zonelist <- build_all_zonelists <- 1. online_pages <- memory_block_action <- memory_block_change_state <- store_mem_state (sys handler) 2. numa_zonelist_order_handler (proc handler) so it cannot be annotated __meminit - drop it fixes following section mismatch warning: WARNING: vmlinux.o(.text+0x71628): Section mismatch in reference from the function mminit_verify_zonelist() to the variable .meminit.data:mminit_loglevel The function mminit_verify_zonelist() references the variable __meminitdata mminit_loglevel. This is often because mminit_verify_zonelist lacks a __meminitdata annotation or the annotation of mminit_loglevel is wrong. Signed-off-by: Marcin Slusarz Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 936ef2e..4e0e265 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -12,7 +12,7 @@ #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT -int __meminitdata mminit_loglevel; +int mminit_loglevel; #ifndef SECTIONS_SHIFT #define SECTIONS_SHIFT 0 -- cgit v1.1 From 481ebd0d76b501c5772f702ae31e55350c0858a3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Aug 2008 14:09:15 -0700 Subject: bootmem: fix aligning of node-relative indexes and offsets Absolute alignment requirements may never be applied to node-relative offsets. Andreas Herrmann spotted this flaw when a bootmem allocation on an unaligned node was itself not aligned because the combination of an unaligned node with an aligned offset into that node is not garuanteed to be aligned itself. This patch introduces two helper functions that align a node-relative index or offset with respect to the node's starting address so that the absolute PFN or virtual address that results from combining the two satisfies the requested alignment. Then all the broken ALIGN()s in alloc_bootmem_core() are replaced by these helpers. Signed-off-by: Johannes Weiner Reported-by: Andreas Herrmann Debugged-by: Andreas Herrmann Reviewed-by: Andreas Herrmann Tested-by: Andreas Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index e023c68..ad8eec6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) +{ + unsigned long base = bdata->node_min_pfn; + + /* + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. + */ + + return ALIGN(base + idx, step) - base; +} + +static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, + unsigned long align) +{ + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; +} + static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, else start = ALIGN(min, step); - sidx = start - bdata->node_min_pfn;; + sidx = start - bdata->node_min_pfn; midx = max - bdata->node_min_pfn; if (bdata->hint_idx > sidx) { @@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, * catch the fallback below. */ fallback = sidx + 1; - sidx = ALIGN(bdata->hint_idx, step); + sidx = align_idx(bdata, bdata->hint_idx, step); } while (1) { @@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long eidx, i, start_off, end_off; find_block: sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = ALIGN(sidx, step); + sidx = align_idx(bdata, sidx, step); eidx = sidx + PFN_UP(size); if (sidx >= midx || eidx > midx) @@ -467,7 +490,7 @@ find_block: for (i = sidx; i < eidx; i++) if (test_bit(i, bdata->node_bootmem_map)) { - sidx = ALIGN(i, step); + sidx = align_idx(bdata, i, step); if (sidx == i) sidx += step; goto find_block; @@ -475,7 +498,7 @@ find_block: if (bdata->last_end_off & (PAGE_SIZE - 1) && PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = ALIGN(bdata->last_end_off, align); + start_off = align_off(bdata, bdata->last_end_off, align); else start_off = PFN_PHYS(sidx); @@ -499,7 +522,7 @@ find_block: } if (fallback) { - sidx = ALIGN(fallback - 1, step); + sidx = align_idx(bdata, fallback - 1, step); fallback = 0; goto find_block; } -- cgit v1.1 From 479db0bf408e65baa14d2a9821abfcbc0804b847 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 20 Aug 2008 14:09:18 -0700 Subject: mm: dirty page tracking race fix There is a race with dirty page accounting where a page may not properly be accounted for. clear_page_dirty_for_io() calls page_mkclean; then TestClearPageDirty. page_mkclean walks the rmaps for that page, and for each one it cleans and write protects the pte if it was dirty. It uses page_check_address to find the pte. That function has a shortcut to avoid the ptl if the pte is not present. Unfortunately, the pte can be switched to not-present then back to present by other code while holding the page table lock -- this should not be a signal for page_mkclean to ignore that pte, because it may be dirty. For example, powerpc64's set_pte_at will clear a previously present pte before setting it to the desired value. There may also be other code in core mm or in arch which do similar things. The consequence of the bug is loss of data integrity due to msync, and loss of dirty page accounting accuracy. XIP's __xip_unmap could easily also be unreliable (depending on the exact XIP locking scheme), which can lead to data corruption. Fix this by having an option to always take ptl to check the pte in page_check_address. It's possible to retain this optimization for page_referenced and try_to_unmap. Signed-off-by: Nick Piggin Cc: Jared Hulbert Cc: Carsten Otte Cc: Hugh Dickins Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 2 +- mm/rmap.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 380ab40..8b710ca 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -185,7 +185,7 @@ __xip_unmap (struct address_space * mapping, address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); diff --git a/mm/rmap.c b/mm/rmap.c index 0597747..0383acf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * + * If @sync is false, page_check_address may perform a racy check to avoid + * the page table lock when the pte is not present (helpful when reclaiming + * highly shared pages). + * * On success returns with pte mapped and locked. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp) + unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; pud_t *pud; @@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { + if (!sync && !pte_present(*pte)) { pte_unmap(pte); return NULL; } @@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -704,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; -- cgit v1.1 From 538f8ea6c85232d00bfa5edd9ba85f16c01057c9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 20 Aug 2008 14:09:20 -0700 Subject: mm: xip fix fault vs sparse page invalidate race XIP has a race between sparse pages being inserted into page tables, and sparse pages being zapped when its time to put a non-sparse page in. What can happen is that a process can be left with a dangling sparse page in a MAP_SHARED mapping, while the rest of the world sees the non-sparse version. Ie. data corruption. Guard these operations with a seqlock, making fault-in-sparse-pages the slowpath, and try-to-unmap-sparse-pages the fastpath. Signed-off-by: Nick Piggin Cc: Jared Hulbert Acked-by: Carsten Otte Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 60 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 8b710ca..5b9ec47 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -22,22 +24,18 @@ * We do use our own empty page to avoid interference with other users * of ZERO_PAGE(), such as /dev/zero */ +static DEFINE_MUTEX(xip_sparse_mutex); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; static struct page *__xip_sparse_page; +/* called under xip_sparse_mutex */ static struct page *xip_sparse_page(void) { if (!__xip_sparse_page) { struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - if (page) { - static DEFINE_SPINLOCK(xip_alloc_lock); - spin_lock(&xip_alloc_lock); - if (!__xip_sparse_page) - __xip_sparse_page = page; - else - __free_page(page); - spin_unlock(&xip_alloc_lock); - } + if (page) + __xip_sparse_page = page; } return __xip_sparse_page; } @@ -174,11 +172,16 @@ __xip_unmap (struct address_space * mapping, pte_t pteval; spinlock_t *ptl; struct page *page; + unsigned count; + int locked = 0; + + count = read_seqcount_begin(&xip_sparse_seq); page = __xip_sparse_page; if (!page) return; +retry: spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; @@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping, } } spin_unlock(&mapping->i_mmap_lock); + + if (locked) { + mutex_unlock(&xip_sparse_mutex); + } else if (read_seqcount_retry(&xip_sparse_seq, count)) { + mutex_lock(&xip_sparse_mutex); + locked = 1; + goto retry; + } } /* @@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; /* XXX: are VM_FAULT_ codes OK? */ - +again: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -245,6 +256,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) __xip_unmap(mapping, vmf->pgoff); found: + printk("%s insert %lx@%lx\n", current->comm, (unsigned long)vmf->virtual_address, xip_pfn); err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, xip_pfn); if (err == -ENOMEM) @@ -252,14 +264,34 @@ found: BUG_ON(err); return VM_FAULT_NOPAGE; } else { + int err, ret = VM_FAULT_OOM; + + mutex_lock(&xip_sparse_mutex); + write_seqcount_begin(&xip_sparse_seq); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (unlikely(!error)) { + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + goto again; + } + if (error != -ENODATA) + goto out; /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return VM_FAULT_OOM; + goto out; + err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, + page); + if (err == -ENOMEM) + goto out; - page_cache_get(page); - vmf->page = page; - return 0; + ret = VM_FAULT_NOPAGE; +out: + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + + return ret; } } -- cgit v1.1 From 14bac5acfdb6a40be64acc042c6db73f1a68f6a4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 20 Aug 2008 14:09:20 -0700 Subject: mm: xip/ext2 fix block allocation race XIP can call into get_xip_mem concurrently with the same file,offset with create=1. This usually maps down to get_block, which expects the page lock to prevent such a situation. This causes ext2 to explode for one reason or another. Serialise those calls for the moment. For common usages today, I suspect get_xip_mem rarely is called to create new blocks. In future as XIP technologies evolve we might need to look at which operations require scalability, and rework the locking to suit. Signed-off-by: Nick Piggin Cc: Jared Hulbert Acked-by: Carsten Otte Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 5b9ec47..b5167df 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -248,15 +248,16 @@ again: int err; /* maybe shared writable, allocate new block */ + mutex_lock(&xip_sparse_mutex); error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (error) return VM_FAULT_SIGBUS; /* unmap sparse mappings at pgoff from all other vmas */ __xip_unmap(mapping, vmf->pgoff); found: - printk("%s insert %lx@%lx\n", current->comm, (unsigned long)vmf->virtual_address, xip_pfn); err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, xip_pfn); if (err == -ENOMEM) @@ -340,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, &xip_mem, &xip_pfn); if (status == -ENODATA) { /* we allocate a new page unmap it */ + mutex_lock(&xip_sparse_mutex); status = a_ops->get_xip_mem(mapping, index, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); -- cgit v1.1 From e80d6a248298721e0ec2cac150c539d8378577d8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Aug 2008 11:10:14 +0100 Subject: [ARM] Skip memory holes in FLATMEM when reading /proc/pagetypeinfo Ordinarily, memory holes in flatmem still have a valid memmap and is safe to use. However, an architecture (ARM) frees up the memmap backing memory holes on the assumption it is never used. /proc/pagetypeinfo reads the whole range of pages in a zone believing that the memmap is valid and that pfn_valid will return false if it is not. On ARM, freeing the memmap breaks the page->zone linkages even though pfn_valid() returns true and the kernel can oops shortly afterwards due to accessing a bogus struct zone *. This patch lets architectures say when FLATMEM can have holes in the memmap. Rather than an expensive check for valid memory, /proc/pagetypeinfo will confirm that the page linkages are still valid by checking page->zone is still the expected zone. The lookup of page_zone is safe as there is a limited range of memory that is accessed when calling page_zone. Even if page_zone happens to return the correct zone, the impact is that the counters in /proc/pagetypeinfo are slightly off but fragmentation monitoring is unlikely to be relevant on an embedded system. Reported-by: H Hartley Sweeten Signed-off-by: Mel Gorman Tested-by: H Hartley Sweeten Signed-off-by: Russell King --- mm/vmstat.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index b0d08e6..d7826af 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); +#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES + /* + * Ordinarily, memory holes in flatmem still have a valid + * memmap for the PFN range. However, an architecture for + * embedded systems (e.g. ARM) can free up the memmap backing + * holes to save memory on the assumption the memmap is + * never used. The page_zone linkages are then broken even + * though pfn_valid() returns true. Skip the page if the + * linkages are broken. Even if this test passed, the impact + * is that the counters for the movable type are off but + * fragmentation monitoring is likely meaningless on small + * systems. + */ + if (page_zone(page) != zone) + continue; +#endif mtype = get_pageblock_migratetype(page); - count[mtype]++; + if (mtype < MIGRATE_TYPES) + count[mtype]++; } /* Print counts */ -- cgit v1.1 From 0ed97ee470c36e05bcaad36c4fb4c501f383ce63 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 1 Sep 2008 11:10:28 +0100 Subject: Remove '#include ' from mm/page_isolation.c Signed-off-by: David Woodhouse --- mm/page_isolation.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58..c69f84f 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -2,7 +2,6 @@ * linux/mm/page_isolation.c */ -#include #include #include #include -- cgit v1.1 From 344c790e3821dac37eb742ddd0b611a300f78b9a Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Tue, 2 Sep 2008 14:35:38 -0700 Subject: mm: make setup_zone_migrate_reserve() aware of overlapping nodes I have gotten to the root cause of the hugetlb badness I reported back on August 15th. My system has the following memory topology (note the overlapping node): Node 0 Memory: 0x8000000-0x44000000 Node 1 Memory: 0x0-0x8000000 0x44000000-0x80000000 setup_zone_migrate_reserve() scans the address range 0x0-0x8000000 looking for a pageblock to move onto the MIGRATE_RESERVE list. Finding no candidates, it happily continues the scan into 0x8000000-0x44000000. When a pageblock is found, the pages are moved to the MIGRATE_RESERVE list on the wrong zone. Oops. setup_zone_migrate_reserve() should skip pageblocks in overlapping nodes. Signed-off-by: Adam Litke Acked-by: Mel Gorman Cc: Dave Hansen Cc: Nishanth Aravamudan Cc: Andy Whitcroft Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af982f7..feb7916 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -694,6 +694,9 @@ static int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; @@ -2516,6 +2519,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; page = pfn_to_page(pfn); + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + /* Blocks with reserved pages will never free, skip them. */ if (PageReserved(page)) continue; -- cgit v1.1 From 6ccfa806a9cfbbf1cd43d5b6aa47ef2c0eb518fd Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Tue, 2 Sep 2008 14:35:40 -0700 Subject: VFS: fix dio write returning EIO when try_to_release_page fails Dio write returns EIO when try_to_release_page fails because bh is still referenced. The patch commit 3f31fddfa26b7594b44ff2b34f9a04ba409e0f91 Author: Mingming Cao Date: Fri Jul 25 01:46:22 2008 -0700 jbd: fix race between free buffer and commit transaction was merged into 2.6.27-rc1, but I noticed that this patch is not enough to fix the race. I did fsstress test heavily to 2.6.27-rc1, and found that dio write still sometimes got EIO through this test. The patch above fixed race between freeing buffer(dio) and committing transaction(jbd) but I discovered that there is another race, freeing buffer(dio) and ext3/4_ordered_writepage. : background_writeout() ->write_cache_pages() ->ext3_ordered_writepage() walk_page_buffers() -> take a bh ref block_write_full_page() -> unlock_page : <- end_page_writeback : <- race! (dio write->try_to_release_page fails) walk_page_buffers() ->release a bh ref ext3_ordered_writepage holds bh ref and does unlock_page remaining taking a bh ref, so this causes the race and failure of try_to_release_page. To fix this race, I used the approach of falling back to buffered writes if try_to_release_page() fails on a page. [akpm@linux-foundation.org: cleanups] Signed-off-by: Hisashi Hifumi Cc: Chris Mason Cc: Jan Kara Cc: Mingming Cao Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 +++++++++-- mm/truncate.c | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 54e9686..876bc59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2129,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + * without clobbering -EIOCBQUEUED from ->direct_IO(). */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); - if (written) + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; goto out; + } } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); diff --git a/mm/truncate.c b/mm/truncate.c index 2505050..6650c1d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) - ret2 = -EIO; + ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; -- cgit v1.1 From 527655835ebac8f58a8f800a10700712a4c2affd Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 2 Sep 2008 14:35:41 -0700 Subject: mm/bootmem: silence section mismatch warning - contig_page_data/bootmem_node_data WARNING: vmlinux.o(.data+0x1f5c0): Section mismatch in reference from the variable contig_page_data to the variable .init.data:bootmem_node_data The variable contig_page_data references the variable __initdata bootmem_node_data If the reference is valid then annotate the variable with __init* (see linux/init.h) or name the variable: *driver, *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console, Signed-off-by: Marcin Slusarz Cc: Johannes Weiner Cc: Sean MacLennan Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index feb7916..e293c58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif -- cgit v1.1 From b954185214c3b562c3fcc651e9ec69d421d76bfa Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 2 Sep 2008 14:35:58 -0700 Subject: mm: size of quicklists shouldn't be proportional to the number of CPUs Quicklists store pages for each CPU as caches. (Each CPU can cache node_free_pages/16 pages) It is used for page table cache. exit() will increase the cache size, while fork() consumes it. So for example if an apache-style application runs (one parent and many child model), one CPU process will fork() while another CPU will process the middleware work and exit(). At that time, the CPU on which the parent runs doesn't have page table cache at all. Others (on which children runs) have maximum caches. QList_max = (#ofCPUs - 1) x Free / 16 => QList_max / (Free + QList_max) = (#ofCPUs - 1) / (16 + #ofCPUs - 1) So, How much quicklist memory is used in the maximum case? This is proposional to # of CPUs because the limit of per cpu quicklist cache doesn't see the number of cpus. Above calculation mean Number of CPUs per node 2 4 8 16 ============================== ==================== QList_max / (Free + QList_max) 5.8% 16% 30% 48% Wow! Quicklist can spend about 50% memory at worst case. My demonstration program is here -------------------------------------------------------------------------------- #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define BUFFSIZE 512 int max_cpu(void) /* get max number of logical cpus from /proc/cpuinfo */ { FILE *fd; char *ret, buffer[BUFFSIZE]; int cpu = 1; fd = fopen("/proc/cpuinfo", "r"); if (fd == NULL) { perror("fopen(/proc/cpuinfo)"); exit(EXIT_FAILURE); } while (1) { ret = fgets(buffer, BUFFSIZE, fd); if (ret == NULL) break; if (!strncmp(buffer, "processor", 9)) cpu = atoi(strchr(buffer, ':') + 2); } fclose(fd); return cpu; } void cpu_bind(int cpu) /* bind current process to one cpu */ { cpu_set_t mask; int ret; CPU_ZERO(&mask); CPU_SET(cpu, &mask); ret = sched_setaffinity(0, sizeof(mask), &mask); if (ret == -1) { perror("sched_setaffinity()"); exit(EXIT_FAILURE); } sched_yield(); /* not necessary */ } #define MMAP_SIZE (10 * 1024 * 1024) /* 10 MB */ #define FORK_INTERVAL 1 /* 1 second */ main(int argc, char *argv[]) { int cpu_max, nextcpu; long pagesize; pid_t pid; /* set max number of logical cpu */ if (argc > 1) cpu_max = atoi(argv[1]) - 1; else cpu_max = max_cpu(); /* get the page size */ pagesize = sysconf(_SC_PAGESIZE); if (pagesize == -1) { perror("sysconf(_SC_PAGESIZE)"); exit(EXIT_FAILURE); } /* prepare parent process */ cpu_bind(0); nextcpu = cpu_max; loop: /* select destination cpu for child process by round-robin rule */ if (++nextcpu > cpu_max) nextcpu = 1; pid = fork(); if (pid == 0) { /* child action */ char *p; int i; /* consume page tables */ p = mmap(0, MMAP_SIZE, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); i = MMAP_SIZE / pagesize; while (i-- > 0) { *p = 1; p += pagesize; } /* move to other cpu */ cpu_bind(nextcpu); /* printf("a child moved to cpu%d after mmap().\n", nextcpu); fflush(stdout); */ /* back page tables to pgtable_quicklist */ exit(0); } else if (pid > 0) { /* parent action */ sleep(FORK_INTERVAL); waitpid(pid, NULL, WNOHANG); } goto loop; } ---------------------------------------- When above program which does task migration runs, my 8GB box spends 800MB of memory for quicklist. This is not memory leak but doesn't seem good. % cat /proc/meminfo MemTotal: 7701568 kB MemFree: 4724672 kB (snip) Quicklists: 844800 kB because - My machine spec is number of numa node: 2 number of cpus: 8 (4CPU x2 node) total mem: 8GB (4GB x2 node) free mem: about 5GB - Then, 4.7GB x 16% ~= 880MB. So, Quicklist can use 800MB. So, if following spec machine run that program CPUs: 64 (8cpu x 8node) Mem: 1TB (128GB x8node) Then, quicklist can waste 300GB (= 1TB x 30%). It is too large. So, I don't like cache policies which is proportional to # of cpus. My patch changes the number of caches from: per-cpu-cache-amount = memory_on_node / 16 to per-cpu-cache-amount = memory_on_node / 16 / number_of_cpus_on_node. Signed-off-by: KOSAKI Motohiro Cc: Keiichiro Tokunaga Acked-by: Christoph Lameter Tested-by: David Miller Acked-by: Mike Travis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/quicklist.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7..8dbb680 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + return max(max, min_pages); } -- cgit v1.1 From ce36394269ccd9d1d286d6192ba09fa6894365e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 16:09:47 +0200 Subject: mmap: fix petty bug in anonymous shared mmap offset handling Anonymous mappings should ignore offset but shared anonymous mapping forgot to clear it and makes the following legit test program trigger SIGBUS. #include #include #include #define PAGE_SIZE 4096 int main(void) { char *p; int i; p = mmap(NULL, 2 * PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, PAGE_SIZE); if (p == MAP_FAILED) { perror("mmap"); return 1; } for (i = 0; i < 2; i++) { printf("page %d\n", i); p[i * 4096] = i; } return 0; } Fix it. Signed-off-by: Tejun Heo Acked-by: Hugh Dickins Acked-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 339cf5c..e7a5a68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: -- cgit v1.1