From a8062231d80239cf3405982858c02aea21a6066a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 7 Apr 2006 19:49:21 +0200 Subject: [PATCH] x86_64: Handle empty PXMs that only contain hotplug memory The node setup code would try to allocate the node metadata in the node itself, but that fails if there is no memory in there. This can happen with memory hotplug when the hotplug area defines an so far empty node. Now use bootmem to try to allocate the mem_map in other nodes. And if it fails don't panic, but just ignore the node. To make this work I added a new __alloc_bootmem_nopanic function that does what its name implies. TBD should try to use nearby nodes here. Currently we just use any. It's hard to do it better because bootmem doesn't have proper fallback lists yet. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/bootmem.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d3e3bd2..d213fed 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -401,7 +401,7 @@ unsigned long __init free_all_bootmem (void) return(free_all_bootmem_core(NODE_DATA(0))); } -void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { bootmem_data_t *bdata; void *ptr; @@ -409,7 +409,14 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned list_for_each_entry(bdata, &bdata_list, list) if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); + return NULL; +} +void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) +{ + void *mem = __alloc_bootmem_nopanic(size,align,goal); + if (mem) + return mem; /* * Whoops, we cannot satisfy the allocation request. */ -- cgit v1.1 From 676165a8af7167f488abdcce6851a9bc36e83254 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 10 Apr 2006 11:21:48 +1000 Subject: [PATCH] Fix buddy list race that could lead to page lru list corruptions Rohit found an obscure bug causing buddy list corruption. page_is_buddy is using a non-atomic test (PagePrivate && page_count == 0) to determine whether or not a free page's buddy is itself free and in the buddy lists. Each of the conjuncts may be true at different times due to unrelated conditions, so the non-atomic page_is_buddy test may find each conjunct to be true even if they were not both true at the same time (ie. the page was not on the buddy lists). Signed-off-by: Martin Bligh Signed-off-by: Rohit Seth Signed-off-by: Nick Piggin Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc523a1..b8165e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -151,7 +151,8 @@ static void bad_page(struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ); + 1 << PG_writeback | + 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -236,12 +237,12 @@ static inline unsigned long page_order(struct page *page) { static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); - __SetPagePrivate(page); + __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPagePrivate(page); + __ClearPageBuddy(page); set_page_private(page, 0); } @@ -280,11 +281,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && - * (b) the buddy is free && - * (c) the buddy is on the buddy system && - * (d) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order. + * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * + * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, int order) { @@ -293,10 +296,10 @@ static inline int page_is_buddy(struct page *page, int order) return 0; #endif - if (PagePrivate(page) && - (page_order(page) == order) && - page_count(page) == 0) + if (PageBuddy(page) && page_order(page) == order) { + BUG_ON(page_count(page) != 0); return 1; + } return 0; } @@ -313,7 +316,7 @@ static inline int page_is_buddy(struct page *page, int order) * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_Private.Page's + * free pages of length of (1 << order) and marked with PG_buddy. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -376,7 +379,8 @@ static inline int free_pages_check(struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -524,7 +528,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); /* -- cgit v1.1 From 5b74ada7eea1b0064d2b72384827853f349d803a Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 10 Apr 2006 22:52:53 -0700 Subject: [PATCH] slab: allocate node local memory for off-slab slabmanagement Allocate off-slab slab descriptors from node local memory. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f055c14..afabad5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2318,13 +2318,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); /* Get the memory for a slab management obj. */ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags) + int colour_off, gfp_t local_flags, + int nodeid) { struct slab *slabp; if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); + slabp = kmem_cache_alloc_node(cachep->slabp_cache, + local_flags, nodeid); if (!slabp) return NULL; } else { @@ -2334,6 +2336,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; + slabp->nodeid = nodeid; return slabp; } @@ -2519,7 +2522,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); if (!slabp) goto opps1; -- cgit v1.1 From fb7faf3313d527bf68ba2e7ff3a2b6ebf201af73 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 10 Apr 2006 22:52:54 -0700 Subject: [PATCH] slab: add statistics for alien cache overflows Add a statistics counter which is incremented everytime the alien cache overflows. alien_cache limit is hardcoded to 12 right now. We can use this statistics to tune alien cache if needed in the future. Signed-off-by: Alok N Kataria Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index afabad5..752c557 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -420,6 +420,7 @@ struct kmem_cache { unsigned long max_freeable; unsigned long node_allocs; unsigned long node_frees; + unsigned long node_overflow; atomic_t allochit; atomic_t allocmiss; atomic_t freehit; @@ -465,6 +466,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) #define STATS_SET_FREEABLE(x, i) \ do { \ if ((x)->max_freeable < i) \ @@ -484,6 +486,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) +#define STATS_INC_ACOVERFLOW(x) do { } while (0) #define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) @@ -3083,9 +3086,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) + if (unlikely(alien->avail == alien->limit)) { + STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); + } alien->entry[alien->avail++] = objp; spin_unlock(&alien->lock); } else { @@ -3763,7 +3768,7 @@ static void print_slabinfo_header(struct seq_file *m) seq_puts(m, " : slabdata "); #if STATS seq_puts(m, " : globalstat " - " "); + " "); seq_puts(m, " : cpustat "); #endif seq_putc(m, '\n'); @@ -3877,11 +3882,12 @@ static int s_show(struct seq_file *m, void *p) unsigned long max_freeable = cachep->max_freeable; unsigned long node_allocs = cachep->node_allocs; unsigned long node_frees = cachep->node_frees; + unsigned long overflows = cachep->node_overflow; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, + %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, - node_frees); + node_frees, overflows); } /* cpu stats */ { -- cgit v1.1 From d6fef9da19b7acd46e04b7dbbba726b3febeca94 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 10 Apr 2006 22:52:56 -0700 Subject: [PATCH] nommu: use compound page in slab allocator The earlier patch to consolidate mmu and nommu page allocation and refcounting by using compound pages for nommu allocations had a bug: kmalloc slabs who's pages were initially allocated by a non-__GFP_COMP allocator could be passed into mm/nommu.c kmalloc allocations which really wanted __GFP_COMP underlying pages. Fix that by having nommu pass __GFP_COMP to all higher order slab allocations. Signed-off-by: Luke Yang Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 752c557..e6ef9bd5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1456,7 +1456,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) int i; flags |= cachep->gfpflags; +#ifndef CONFIG_MMU + /* nommu uses slab's for process anonymous memory allocations, so + * requires __GFP_COMP to properly refcount higher order allocations" + */ + page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); +#else page = alloc_pages_node(nodeid, flags, cachep->gfporder); +#endif if (!page) return NULL; addr = page_address(page); -- cgit v1.1 From 1e624196f43c3a62122959e15c5f03572cdadb5d Mon Sep 17 00:00:00 2001 From: Ram Gupta Date: Mon, 10 Apr 2006 22:52:57 -0700 Subject: [PATCH] mm: fix bug in brk() The code checks for newbrk with oldbrk which are page aligned before making a check for the memory limit set of data segment. If the memory limit is not page aligned in that case it bypasses the test for the limit if the memory allocation is still for the same page. Signed-off-by: Ram Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e780d19..eab6fcb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -220,6 +220,17 @@ asmlinkage unsigned long sys_brk(unsigned long brk) if (brk < mm->end_code) goto out; + + /* + * Check against rlimit here. If this check is done later after the test + * of oldbrk with newbrk then it can escape the test and let the data + * segment grow beyond its set limit the in case where the limit is + * not page aligned -Ram Gupta + */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) @@ -232,11 +243,6 @@ asmlinkage unsigned long sys_brk(unsigned long brk) goto out; } - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - /* Check against existing mmap mappings. */ if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) goto out; -- cgit v1.1 From e23ca00bf1b1c6c0f04702cb4d29e275ab8dc330 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 10 Apr 2006 22:52:57 -0700 Subject: [PATCH] Some page migration fixups - Remove sparse comment - Remove duplicated include - Return the correct error condition in migrate_page_remove_references(). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 09f6e4a..20b95db 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -16,8 +16,7 @@ #include #include #include -#include /* for try_to_release_page(), - buffer_heads_over_limit */ +#include #include #include #include @@ -28,8 +27,6 @@ #include "internal.h" -#include "internal.h" - /* The maximum number of pages to take off the LRU for migration */ #define MIGRATE_CHUNK_SIZE 256 @@ -234,7 +231,7 @@ int migrate_page_remove_references(struct page *newpage, if (!page_mapping(page) || page_count(page) != nr_refs || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); - return 1; + return -EAGAIN; } /* -- cgit v1.1 From cb45b0e966cbe747b6189c15b108901cc7d6c97c Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:52:59 -0700 Subject: [PATCH] overcommit: add calculate_totalreserve_pages() These patches are an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory(). - why the kernel needed patching When the kernel can't allocate anonymous pages in practice, currnet OVERCOMMIT_GUESS could return success. This implementation might be the cause of oom kill in memory pressure situation. If the Linux runs with page reservation features like /proc/sys/vm/lowmem_reserve_ratio and without swap region, I think the oom kill occurs easily. - the overall design approach in the patch When the OVERCOMMET_GUESS algorithm calculates number of free pages, the reserved free pages are regarded as non-free pages. This change helps to avoid the pitfall that the number of free pages become less than the number which the kernel tries to keep free. - testing results I tested the patches using my test kernel module. If the patches aren't applied to the kernel, __vm_enough_memory() returns success in the situation but autual page allocation is failed. On the other hand, if the patches are applied to the kernel, memory allocation failure is avoided since __vm_enough_memory() returns failure in the situation. I checked that on i386 SMP 16GB memory machine. I haven't tested on nommu environment currently. This patch adds totalreserve_pages for __vm_enough_memory(). Calculate_totalreserve_pages() checks maximum lowmem_reserve pages and pages_high in each zone. Finally, the function stores the sum of each zone to totalreserve_pages. The totalreserve_pages is calculated when the VM is initilized. And the variable is updated when /proc/sys/vm/lowmem_reserve_raito or /proc/sys/vm/min_free_kbytes are changed. Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8165e0..97d6827 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,6 +51,7 @@ nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; @@ -2477,6 +2478,38 @@ void __init page_alloc_init(void) } /* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes. + */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + int i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. */ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + +/* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of @@ -2507,6 +2540,9 @@ static void setup_per_zone_lowmem_reserve(void) } } } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2561,6 +2597,9 @@ void setup_per_zone_pages_min(void) zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* -- cgit v1.1 From 6d9f78396583244258080f3369889644c06c37c8 Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:53:00 -0700 Subject: [PATCH] overcommit: use totalreserve_pages This patch is an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory() in mm/mmap.c. When the OVERCOMMIT_GUESS algorithm calculates the number of free pages, the algorithm subtracts the number of reserved pages from the result nr_free_pages(). Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eab6fcb..e6ee123 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -121,14 +121,26 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * only call if we're about to fail. */ n = nr_free_pages(); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ if (!cap_sys_admin) n -= n / 32; free += n; if (free > pages) return 0; - vm_unacct_memory(pages); - return -ENOMEM; + + goto error; } allowed = (totalram_pages - hugetlb_total_pages()) @@ -150,7 +162,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) */ if (atomic_read(&vm_committed_space) < (long)allowed) return 0; - +error: vm_unacct_memory(pages); return -ENOMEM; -- cgit v1.1 From d5ddc79bcaab6975e7671805c3578407dc33b764 Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:53:01 -0700 Subject: [PATCH] overcommit: use totalreserve_pages for nommu This patch is an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory() in mm/nommu.c. When the OVERCOMMIT_GUESS algorithm calculates the number of free pages, the algorithm subtracts the number of reserved pages from the result nr_free_pages(). Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index db45efa..029fada 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1147,14 +1147,26 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * only call if we're about to fail. */ n = nr_free_pages(); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (n <= totalreserve_pages) + goto error; + else + n -= totalreserve_pages; + + /* + * Leave the last 3% for root + */ if (!cap_sys_admin) n -= n / 32; free += n; if (free > pages) return 0; - vm_unacct_memory(pages); - return -ENOMEM; + + goto error; } allowed = totalram_pages * sysctl_overcommit_ratio / 100; @@ -1175,7 +1187,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) */ if (atomic_read(&vm_committed_space) < (long)allowed) return 0; - +error: vm_unacct_memory(pages); return -ENOMEM; -- cgit v1.1 From 64a3ca5f7ec2606b03be4a65736164a5373732ed Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Apr 2006 22:53:03 -0700 Subject: [PATCH] mm/migrate.c: don't export a static function EXPORT_SYMBOL'ing of a static function is not a good idea. Signed-off-by: Adrian Bunk Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 20b95db..d444229 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -173,7 +173,6 @@ unlock_retry: retry: return -EAGAIN; } -EXPORT_SYMBOL(swap_page); /* * Remove references for a page and establish the new page with the correct -- cgit v1.1 From fd5403c79bc21819f6e0c40ba098cea8b6a418bd Mon Sep 17 00:00:00 2001 From: Coywolf Qi Hunt Date: Mon, 10 Apr 2006 22:54:35 -0700 Subject: [PATCH] page-writeback comment fixes Signed-off-by: Coywolf Qi Hunt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6dcce3a..75d7f48 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -72,13 +72,12 @@ int dirty_background_ratio = 10; int vm_dirty_ratio = 40; /* - * The interval between `kupdate'-style writebacks, in centiseconds - * (hundredths of a second) + * The interval between `kupdate'-style writebacks, in jiffies */ int dirty_writeback_interval = 5 * HZ; /* - * The longest number of centiseconds for which data is allowed to remain dirty + * The longest number of jiffies for which data is allowed to remain dirty */ int dirty_expire_interval = 30 * HZ; -- cgit v1.1 From 69cf0fac6052c5bd3fb3469a41d4216e926028f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 17 Apr 2006 22:46:32 +0100 Subject: [PATCH] Fix MADV_REMOVE protection checking madvise_remove needs to respect file and mmap protections. Signed-off-by: Hugh Dickins [ Will the real CVE-2006-1524 stand up, please.. ] Signed-off-by: Linus Torvalds --- mm/madvise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index af3d573..4e19615 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -168,6 +168,9 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + mapping = vma->vm_file->f_mapping; offset = (loff_t)(start - vma->vm_start) -- cgit v1.1 From 75129e297e861e6c61038aa4cdbf604b022de4ff Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Tue, 18 Apr 2006 22:20:33 -0700 Subject: [PATCH] mm/slob.c: for_each_possible_cpu(), not NR_CPUS Convert for-loops that explicitly reference "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index 9bcc7e2..a68255b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -354,9 +354,7 @@ void *__alloc_percpu(size_t size) if (!pdata) return NULL; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) { pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); if (!pdata->ptrs[i]) goto unwind_oom; @@ -383,11 +381,9 @@ free_percpu(const void *objp) int i; struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; + for_each_possible_cpu(i) kfree(p->ptrs[i]); - } + kfree(p); } EXPORT_SYMBOL(free_percpu); -- cgit v1.1 From 97c2c9b84d0c1edf4926b13661d5af3f0edccbce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:38 -0700 Subject: [PATCH] oom-kill: mm locking fix Dave Peterson points out that badness() is playing with mm_structs without taking a reference on them. mmput() can sleep, so taking a reference here (inside tasklist_lock) is hard. Fix it up via task_lock() instead. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 78747af..9a643c4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -46,15 +46,25 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; - struct list_head *tsk; + struct mm_struct *mm; + struct task_struct *child; - if (!p->mm) + task_lock(p); + mm = p->mm; + if (!mm) { + task_unlock(p); return 0; + } /* * The memory size of the process is the basis for the badness. */ - points = p->mm->total_vm; + points = mm->total_vm; + + /* + * After this unlock we can no longer dereference local variable `mm' + */ + task_unlock(p); /* * Processes which fork a lot of child processes are likely @@ -64,11 +74,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * child is eating the vast majority of memory, adding only half * to the parents will make the child our kill candidate of choice. */ - list_for_each(tsk, &p->children) { - struct task_struct *chld; - chld = list_entry(tsk, struct task_struct, sibling); - if (chld->mm != p->mm && chld->mm) - points += chld->mm->total_vm/2 + 1; + list_for_each_entry(child, &p->children, sibling) { + task_lock(child); + if (child->mm != mm && child->mm) + points += child->mm->total_vm/2 + 1; + task_unlock(child); } /* -- cgit v1.1 From 013159227b840dfd441bd2e4c8b4d77ffb3cc42e Mon Sep 17 00:00:00 2001 From: Dave Peterson Date: Tue, 18 Apr 2006 22:20:44 -0700 Subject: [PATCH] mm: fix mm_struct reference counting bugs in mm/oom_kill.c Fix oom_kill_task() so it doesn't call mmput() (which may sleep) while holding tasklist_lock. Signed-off-by: David S. Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9a643c4..042e643 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,17 +254,24 @@ static void __oom_kill_task(task_t *p, const char *message) force_sig(SIGKILL, p); } -static struct mm_struct *oom_kill_task(task_t *p, const char *message) +static int oom_kill_task(task_t *p, const char *message) { - struct mm_struct *mm = get_task_mm(p); + struct mm_struct *mm; task_t * g, * q; - if (!mm) - return NULL; - if (mm == &init_mm) { - mmput(mm); - return NULL; - } + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL || mm == &init_mm) + return 1; __oom_kill_task(p, message); /* @@ -276,13 +283,12 @@ static struct mm_struct *oom_kill_task(task_t *p, const char *message) __oom_kill_task(q, message); while_each_thread(g, q); - return mm; + return 0; } -static struct mm_struct *oom_kill_process(struct task_struct *p, - unsigned long points, const char *message) +static int oom_kill_process(struct task_struct *p, unsigned long points, + const char *message) { - struct mm_struct *mm; struct task_struct *c; struct list_head *tsk; @@ -293,9 +299,8 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, c = list_entry(tsk, struct task_struct, sibling); if (c->mm == p->mm) continue; - mm = oom_kill_task(c, message); - if (mm) - return mm; + if (!oom_kill_task(c, message)) + return 0; } return oom_kill_task(p, message); } @@ -310,7 +315,6 @@ static struct mm_struct *oom_kill_process(struct task_struct *p, */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct mm_struct *mm = NULL; task_t *p; unsigned long points = 0; @@ -330,12 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) */ switch (constrained_alloc(zonelist, gfp_mask)) { case CONSTRAINT_MEMORY_POLICY: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory (MPOL_BIND)"); break; case CONSTRAINT_CPUSET: - mm = oom_kill_process(current, points, + oom_kill_process(current, points, "No available memory in cpuset"); break; @@ -357,8 +361,7 @@ retry: panic("Out of memory and no killable processes...\n"); } - mm = oom_kill_process(p, points, "Out of memory"); - if (!mm) + if (oom_kill_process(p, points, "Out of memory")) goto retry; break; @@ -367,8 +370,6 @@ retry: out: read_unlock(&tasklist_lock); cpuset_unlock(); - if (mm) - mmput(mm); /* * Give "p" a good chance of killing itself before we -- cgit v1.1 From 6aa3001b239b387d98a7f945e4a51edeb59e4f2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:52 -0700 Subject: [PATCH] page_alloc.c: buddy handling cleanup Fix up some whitespace damage. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827..123c605 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,11 +232,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -299,9 +301,9 @@ static inline int page_is_buddy(struct page *page, int order) if (PageBuddy(page) && page_order(page) == order) { BUG_ON(page_count(page) != 0); - return 1; + return 1; } - return 0; + return 0; } /* -- cgit v1.1 From 6d472be37896b1c41b50f3da124f8b7718ba7797 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Apr 2006 02:43:12 -0700 Subject: [PATCH] Remove cond_resched in gather_stats() gather_stats() is called with a spinlock held from check_pte_range. We cannot reschedule with a lock held. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dec8249..8778f58 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1761,7 +1761,6 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) md->mapcount_max = count; md->node[page_to_nid(page)]++; - cond_resched(); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.1