From 818cf5909701806285d977f7a9365c5cadb062a7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 23 Apr 2009 09:58:22 +0300 Subject: slub: enforce MAX_ORDER slub_max_order may not be equal to or greater than MAX_ORDER. Additionally, if a single object cannot be placed in a slab of slub_max_order, it still must allocate slabs below MAX_ORDER. Acked-by: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7ab54ec..0e1247e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1909,7 +1909,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -2522,6 +2522,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } -- cgit v1.1 From bc43f75cd9815833b27831600ccade672edb5e43 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 30 Apr 2009 15:08:08 -0700 Subject: mm: fix pageref leak in do_swap_page() By the time the memory cgroup code is notified about a swapin we already hold a reference on the fault page. If the cgroup callback fails, make sure to unlock AND release the page reference which was taken by lookup_swap_cache(), or we leak the reference. 
Signed-off-by: Johannes Weiner Cc: Balbir Singh Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index cf6873e..6a4ef0f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2458,8 +2458,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; - unlock_page(page); - goto out; + goto out_page; } /* @@ -2521,6 +2520,7 @@ out: out_nomap: mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); +out_page: unlock_page(page); page_cache_release(page); return ret; -- cgit v1.1 From c0bd3f63ce01a1757dbce6373122a05fbf99ced7 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 30 Apr 2009 15:08:11 -0700 Subject: memcg: fix try_get_mem_cgroup_from_swapcache() This is a bugfix for commit 3c776e64660028236313f0e54f3a9945764422df ("memcg: charge swapcache to proper memcg"). Used bit of swapcache is solid under page lock, but considering move_account, pc->mem_cgroup is not. We need lock_page_cgroup() anyway. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e44fb0f..575203a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit of swapcache is solid under page lock. 
- */ + lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) @@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) mem = NULL; rcu_read_unlock(); } + unlock_page_cgroup(pc); return mem; } -- cgit v1.1 From b827e496c893de0c0f142abfaeb8730a2fd6b37f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 30 Apr 2009 15:08:16 -0700 Subject: mm: close page_mkwrite races Change page_mkwrite to allow implementations to return with the page locked, and also change its callers (in page fault paths) to hold the lock until the page is marked dirty. This allows the filesystem to have full control of page dirtying events coming from the VM. Rather than simply hold the page locked over the page_mkwrite call, we call page_mkwrite with the page unlocked and allow implementations to return with it locked, so filesystems can avoid LOR conditions with page lock. The problem with the current scheme is this: a filesystem that wants to associate some metadata with a page as long as the page is dirty, will perform this manipulation in its ->page_mkwrite. It currently then must return with the page unlocked and may not hold any other locks (according to existing page_mkwrite convention). In this window, the VM could write out the page, clearing page-dirty. The filesystem has no good way to detect that a dirty pte is about to be attached, so it will happily write out the page, at which point, the filesystem may manipulate the metadata to reflect that the page is no longer dirty. It is not always possible to perform the required metadata manipulation in ->set_page_dirty, because that function cannot block or fail. The filesystem may need to allocate some data structure, for example. And the VM cannot mark the pte dirty before page_mkwrite, because page_mkwrite is allowed to fail, so we must not allow any window where the page could be written to if page_mkwrite does fail. 
This solution of holding the page locked over the 3 critical operations (page_mkwrite, setting the pte dirty, and finally setting the page dirty) closes out races nicely, preventing page cleaning for writeout being initiated in that window. This provides the filesystem with a strong synchronisation against the VM here. - Sage needs this race closed for ceph filesystem. - Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913). - I need it for fsblock. - I suspect other filesystems may need it too (eg. btrfs). - I have converted buffer.c to the new locking. Even simple block allocation under dirty pages might be susceptible to i_size changing under partial page at the end of file (we also have a buffer.c-side problem here, but it cannot be fixed properly without this patch). - Other filesystems (eg. NFS, maybe btrfs) will need to change their page_mkwrite functions themselves. [ This also moves page_mkwrite another step closer to fault, which should eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a filesystem calldown and page lock/unlock cycle in __do_fault. 
] [akpm@linux-foundation.org: fix derefs of NULL ->mapping] Cc: Sage Weil Cc: Trond Myklebust Signed-off-by: Nick Piggin Cc: Valdis Kletnieks Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 108 ++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6a4ef0f..4126dd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = tmp; goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(old_page); + if (!old_page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(old_page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(old_page)); /* * Since we dropped the lock we need to revalidate @@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); goto unlock; + } page_mkwrite = 1; } @@ -2094,9 +2105,6 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty @@ -2105,16 +2113,41 @@ unlock: * * do_no_page is protected similarly. 
*/ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); } return ret; oom_free_new: page_cache_release(new_page); oom: - if (old_page) + if (old_page) { + if (page_mkwrite) { + unlock_page(old_page); + page_cache_release(old_page); + } page_cache_release(old_page); + } return VM_FAULT_OOM; unwritable_page: @@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, int tmp; unlock_page(page); - vmf.flags |= FAULT_FLAG_MKWRITE; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; - anon = 1; /* no anon but release vmf.page */ - goto out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. 
- */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ - goto out; + goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(page)); page_mkwrite = 1; } } @@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); + if (dirty_page) { + struct address_space *mapping = page->mapping; - set_page_dirty_balance(dirty_page, page_mkwrite); + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; + unlock_page(dirty_page); put_page(dirty_page); + if (page_mkwrite && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + } else { + unlock_page(vmf.page); + if (anon) + page_cache_release(vmf.page); } return ret; + +unwritable_page: + page_cache_release(page); + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.1 From ae3abae64f177586be55b04a7fb7047a34b21a3e Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 30 Apr 2009 15:08:19 -0700 Subject: memcg: fix mem_cgroup_shrink_usage() Current mem_cgroup_shrink_usage() has two problems. 1. It doesn't call mem_cgroup_out_of_memory and doesn't update last_oom_jiffies, so pagefault_out_of_memory invokes global OOM. 2. Considering hierarchy, shrinking has to be done from the mem_over_limit, not from the memcg which the page would be charged to. 
mem_cgroup_try_charge_swapin() does all of these things properly, so we use it and call cancel_charge_swapin when it succeeded. The name of "shrink_usage" is not appropriate for this behavior, so we change it too. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Paul Menage Cc: Dhaval Giani Cc: Daisuke Nishimura Cc: YAMAMOTO Takashi Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 ++++++++++++--------------------- mm/shmem.c | 8 ++++++-- 2 files changed, 18 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 575203a..01c2d8f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1617,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, } /* - * A call to try to shrink memory usage under specified resource controller. - * This is typically used for page reclaiming for shmem for reducing side - * effect of page allocation from shmem, which is used by some mem_cgroup. + * A call to try to shrink memory usage on charge failure at shmem's swapin. + * Calling hierarchical_reclaim is not enough because we should update + * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. + * Moreover considering hierarchy, we should reclaim from the mem_over_limit, + * not from the memcg which this page would be charged to. + * try_charge_swapin does all of these works properly. 
*/ -int mem_cgroup_shrink_usage(struct page *page, +int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *mem = NULL; - int progress = 0; - int retry = MEM_CGROUP_RECLAIM_RETRIES; + int ret; if (mem_cgroup_disabled()) return 0; - if (page) - mem = try_get_mem_cgroup_from_swapcache(page); - if (!mem && mm) - mem = try_get_mem_cgroup_from_mm(mm); - if (unlikely(!mem)) - return 0; - do { - progress = mem_cgroup_hierarchical_reclaim(mem, - gfp_mask, true, false); - progress += mem_cgroup_check_under_limit(mem); - } while (!progress && --retry); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - css_put(&mem->css); - if (!retry) - return -ENOMEM; - return 0; + return ret; } static DEFINE_MUTEX(set_limit_mutex); diff --git a/mm/shmem.c b/mm/shmem.c index f9cb20e..b25f95c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1340,8 +1340,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { - /* allow reclaim from this memory cgroup */ - error = mem_cgroup_shrink_usage(swappage, + /* + * reclaim from proper memory cgroup and + * call memcg's OOM if needed. 
+ */ + error = mem_cgroup_shmem_charge_fallback( + swappage, current->mm, gfp); if (error) { -- cgit v1.1 From 00a62ce91e554198ef28234c91c36f850f5a3bc9 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 30 Apr 2009 15:08:51 -0700 Subject: mm: fix Committed_AS underflow on large NR_CPUS environment The Committed_AS field can underflow in certain situations: > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c > 1 Committed_AS: 18446744073709323392 kB > 11 Committed_AS: 18446744073709455488 kB > 6 Committed_AS: 35136 kB > 5 Committed_AS: 18446744073709454400 kB > 7 Committed_AS: 35904 kB > 3 Committed_AS: 18446744073709453248 kB > 2 Committed_AS: 34752 kB > 9 Committed_AS: 18446744073709453248 kB > 8 Committed_AS: 34752 kB > 3 Committed_AS: 18446744073709320960 kB > 7 Committed_AS: 18446744073709454080 kB > 3 Committed_AS: 18446744073709320960 kB > 5 Committed_AS: 18446744073709454080 kB > 6 Committed_AS: 18446744073709320960 kB This happens because NR_CPUS can be greater than 1000 and meminfo_proc_show() does not check for underflow. But making the threshold proportional to NR_CPUS is not a good calculation. In general, the possibility of lock contention is proportional to the number of online cpus, not the theoretical maximum number of cpus (NR_CPUS). The current kernel has generic percpu-counter infrastructure. Using it is the right way: it simplifies the code, and percpu_counter_read_positive() does not have the underflow issue. 
Reported-by: Dave Hansen Signed-off-by: KOSAKI Motohiro Cc: Eric B Munson Cc: Mel Gorman Cc: Christoph Lameter Cc: [All kernel versions] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++------ mm/nommu.c | 13 +++++++------ mm/swap.c | 46 ---------------------------------------------- 3 files changed, 13 insertions(+), 58 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3303d1b..6b7b1a9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; /* * Check that a process has enough memory to allocate a new virtual @@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: vm_unacct_memory(pages); @@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 72eda4a..809998a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,7 +62,7 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; @@ -463,6 
+463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } @@ -1847,12 +1851,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; + error: vm_unacct_memory(pages); diff --git a/mm/swap.c b/mm/swap.c index bede23c..cb29ae5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, EXPORT_SYMBOL(pagevec_lookup_tag); -#ifdef CONFIG_SMP -/* - * We tolerate a little inaccuracy to avoid ping-ponging the counter between - * CPUs - */ -#define ACCT_THRESHOLD max(16, NR_CPUS * 2) - -static DEFINE_PER_CPU(long, committed_space); - -void vm_acct_memory(long pages) -{ - long *local; - - preempt_disable(); - local = &__get_cpu_var(committed_space); - *local += pages; - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { - atomic_long_add(*local, &vm_committed_space); - *local = 0; - } - preempt_enable(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Drop the CPU's cached committed space back into the central pool. 
*/ -static int cpu_swap_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long *committed; - - committed = &per_cpu(committed_space, (long)hcpu); - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - atomic_long_add(*committed, &vm_committed_space); - *committed = 0; - drain_cpu_pagevecs((long)hcpu); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ -#endif /* CONFIG_SMP */ - /* * Perform any setup for the swap system */ @@ -554,7 +511,4 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ -#ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); -#endif } -- cgit v1.1 From 8713e01295140f674a41f2199b0f7ca99dfb69d5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 30 Apr 2009 15:08:55 -0700 Subject: vmscan: avoid multiplication overflow in shrink_zone() Local variable `scan' can overflow on zones which are larger than (2G * 4k) / 100 = 80GB. Making it 64-bit on 64-bit will fix that up. Cc: KOSAKI Motohiro Cc: Wu Fengguang Cc: Peter Zijlstra Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index eac9577..5fa3eda 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1471,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone, for_each_evictable_lru(l) { int file = is_file_lru(l); - int scan; + unsigned long scan; scan = zone_nr_pages(zone, sc, l); if (priority) { -- cgit v1.1 From a425a638c858fd10370b573bde81df3ba500e271 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 May 2009 16:37:17 +0100 Subject: Ignore madvise(MADV_WILLNEED) for hugetlbfs-backed regions madvise(MADV_WILLNEED) forces page cache readahead on a range of memory backed by a file. The assumption is made that the page required is order-0 and "normal" page cache. 
On hugetlbfs, this assumption is not true and order-0 pages are allocated and inserted into the hugetlbfs page cache. This leaks hugetlbfs page reservations and can cause BUGs to trigger related to corrupted page tables. This patch causes MADV_WILLNEED to be ignored for hugetlbfs-backed regions. Signed-off-by: Mel Gorman Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574..36d6ea2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -112,6 +112,14 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; + /* + * Page cache readahead assumes page cache pages are order-0 which + * is not the case for hugetlbfs. Do not give a bad return value + * but ignore the advice. + */ + if (vma->vm_flags & VM_HUGETLB) + return 0; + if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; -- cgit v1.1 From 1eb5ac6466d4be7b15b38ce3ab709600f1bc891f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 5 May 2009 19:13:44 +1000 Subject: mm: SLUB fix reclaim_state SLUB does not correctly account reclaim_state.reclaimed_slab, so it will break memory reclaim. Account it like SLAB does. 
Cc: stable@kernel.org Cc: linux-mm@kvack.org Cc: Matt Mackall Acked-by: Christoph Lameter Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7ab54ec..aa34913 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -9,6 +9,7 @@ */ #include +#include /* struct reclaim_state */ #include #include #include @@ -1170,6 +1171,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); } -- cgit v1.1 From 1f0532eb617d28f65c93593a1491f662f14f7eac Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 5 May 2009 19:13:45 +1000 Subject: mm: SLOB fix reclaim_state SLOB does not correctly account reclaim_state.reclaimed_slab, so it will break memory reclaim. Account it like SLAB does. Cc: stable@kernel.org Cc: linux-mm@kvack.org Acked-by: Matt Mackall Acked-by: Christoph Lameter Signed-off-by: Nick Piggin Signed-off-by: Pekka Enberg --- mm/slob.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slob.c b/mm/slob.c index a2d4ab3..f92e66d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -60,6 +60,7 @@ #include #include #include +#include /* struct reclaim_state */ #include #include #include @@ -255,6 +256,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) static void slob_free_pages(void *b, int order) { + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; free_pages((unsigned long)b, order); } @@ -407,7 +410,7 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); - free_page((unsigned long)b); + slob_free_pages(b, 0); return; } -- cgit v1.1 From 184101bf143ac96d62b3dcc17e7b3550f98d3350 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 May 2009 
16:02:55 -0700 Subject: oom: prevent livelock when oom_kill_allocating_task is set When /proc/sys/vm/oom_kill_allocating_task is set for large systems that want to avoid the lengthy tasklist scan, it's possible to livelock if current is ineligible for oom kill. This normally happens when it is set to OOM_DISABLE, but is also possible if any threads are sharing the same ->mm with a different tgid. So change __out_of_memory() to fall back to the full task-list scan if it was unable to kill `current'. Cc: Nick Piggin Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2f3166e..92bcf1d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ static void __out_of_memory(gfp_t gfp_mask, int order) { - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)"); - - } else { - unsigned long points; - struct task_struct *p; - -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); + struct task_struct *p; + unsigned long points; - if (PTR_ERR(p) == -1UL) + if (sysctl_oom_kill_allocating_task) + if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)")) return; +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); - /* Found nothing?!?! Either we hang forever, or we panic. 
*/ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } + if (PTR_ERR(p) == -1UL) + return; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; } /* -- cgit v1.1 From 2498ce42d3a4d1a498f1df4884da960087547db7 Mon Sep 17 00:00:00 2001 From: Ralph Wuerthner Date: Wed, 6 May 2009 16:02:59 -0700 Subject: alloc_vmap_area: fix memory leak If alloc_vmap_area() fails the allocated struct vmap_area has to be freed. Signed-off-by: Ralph Wuerthner Reviewed-by: Christoph Lameter Reviewed-by: Minchan Kim Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fab1987..083716e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -402,6 +402,7 @@ overflow: printk(KERN_WARNING "vmap allocation for size %lu failed: " "use vmalloc= to increase size.\n", size); + kfree(va); return ERR_PTR(-EBUSY); } -- cgit v1.1 From 9155203a5de94278525647b16733f0c315f3b786 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:02 -0700 Subject: mm: use rounddown_pow_of_two() in zone_batchsize() Use rounddown_pow_of_two(N) in zone_batchsize() rather than (1 << (fls(N)-1)) as they are equivalent, and with the former it is easier to see what is going on. 
Signed-off-by: David Howells Tested-by: Lanttor Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f2699..8add7da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2706,7 +2706,7 @@ static int zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << (fls(batch + batch/2)-1)) - 1; + batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; } -- cgit v1.1 From 3a6be87fd1e5cdbbc3b6a14d02a3efa9ecba1d3f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:03 -0700 Subject: nommu: clamp zone_batchsize() to 0 under NOMMU conditions Clamp zone_batchsize() to 0 under NOMMU conditions to stop free_hot_cold_page() from queueing and batching frees. The problem is that under NOMMU conditions it is really important to be able to allocate large contiguous chunks of memory, but when munmap() or exit_mmap() releases big stretches of memory, return of these to the buddy allocator can be deferred, and when it does finally happen, it can be in small chunks. Whilst the fragmentation this incurs isn't so much of a problem under MMU conditions as userspace VM is glued together from individual pages with the aid of the MMU, it is a real problem if there isn't an MMU. By clamping the page freeing queue size to 0, pages are returned to the allocator immediately, and the buddy detector is more likely to be able to glue them together into large chunks immediately, and fragmentation is less likely to occur. By disabling batching of frees, and by turning off the trimming of excess space during boot, Coldfire can manage to boot. 
Reported-by: Lanttor Guo Signed-off-by: David Howells Tested-by: Lanttor Guo Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8add7da..fe753ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) static int zone_batchsize(struct zone *zone) { +#ifdef CONFIG_MMU int batch; /* @@ -2709,6 +2710,23 @@ static int zone_batchsize(struct zone *zone) batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU + * conditions. + * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. + * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) -- cgit v1.1 From fc4d5c292b68ef02514d2072dcbf82d090c34875 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:05 -0700 Subject: nommu: make the initial mmap allocation excess behaviour Kconfig configurable NOMMU mmap() has an option controlled by a sysctl variable that determines whether the allocations made by do_mmap_private() should have the excess space trimmed off and returned to the allocator. Make the initial setting of this variable a Kconfig configuration option. 
The reason there can be excess space is that the allocator only allocates in power-of-2 size chunks, but mmap()'s can be made in sizes that aren't a power of 2. There are two alternatives: (1) Keep the excess as dead space. The dead space then remains unused for the lifetime of the mapping. Mappings of shared objects such as libc, ld.so or busybox's text segment may retain their dead space forever. (2) Return the excess to the allocator. This means that the dead space is limited to less than a page per mapping, but it means that for a transient process, there's more chance of fragmentation as the excess space may be reused fairly quickly. During the boot process, a lot of transient processes are created, and this can cause a lot of fragmentation as the pagecache and various slabs grow greatly during this time. By turning off the trimming of excess space during boot and disabling batching of frees, Coldfire can manage to boot. A better way of doing things might be to have /sbin/init turn this option off. By that point libc, ld.so and init - which are all long-duration processes - have all been loaded and trimmed. Reported-by: Lanttor Guo Signed-off-by: David Howells Tested-by: Lanttor Guo Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 28 ++++++++++++++++++++++++++++ mm/nommu.c | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 57971d2..c2b57d8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. 
To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. + + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/nommu-mmap.txt for more information. diff --git a/mm/nommu.c b/mm/nommu.c index 809998a..67cd1a4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -66,7 +66,7 @@ struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -- cgit v1.1 From 8c9ed899b44c19e81859fbb0e9d659fe2f8630fc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 May 2009 11:41:37 +0100 Subject: NOMMU: Don't check vm_region::vm_start is page aligned in add_nommu_region() Don't check vm_region::vm_start is page aligned in add_nommu_region() because the region may reflect some non-page-aligned mapped file, such as could be obtained from RomFS XIP. 
Signed-off-by: David Howells Acked-by: Greg Ungerer Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 67cd1a4..b571ef7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -515,8 +515,6 @@ static void add_nommu_region(struct vm_region *region) validate_nommu_regions(); - BUG_ON(region->vm_start & ~PAGE_MASK); - parent = NULL; p = &nommu_region_tree.rb_node; while (*p) { -- cgit v1.1 From 0f181328287db30671e9997329cff71395d4af8b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2009 08:29:12 -0700 Subject: Revert "Ignore madvise(MADV_WILLNEED) for hugetlbfs-backed regions" This reverts commit a425a638c858fd10370b573bde81df3ba500e271. Now that the previous commit removed the "readpage" actor for hugetlb files, read-ahead will no longer mess up the mapping, and there's no longer any reason to treat hugetlbfs mappings specially. Tested-and-acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 36d6ea2..b9ce574 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -112,14 +112,6 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - /* - * Page cache readahead assumes page cache pages are order-0 which - * is not the case for hugetlbfs. Do not give a bad return value - * but ignore the advice. - */ - if (vma->vm_flags & VM_HUGETLB) - return 0; - if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; -- cgit v1.1 From cd17cbfda004fe5f406c01b318c6378d9895896f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 May 2009 11:32:24 +0200 Subject: Revert "mm: add /proc controls for pdflush threads" This reverts commit fafd688e4c0c34da0f3de909881117d374e4c7af. Work is progressing to switch away from pdflush as the process backing for flushing out dirty data. 
So it seems pointless to add more knobs to control pdflush threads. The original author of the patch did not have any specific use cases for adding the knobs, so we can easily revert this before 2.6.30 to avoid having to maintain this API forever. Signed-off-by: Jens Axboe --- mm/pdflush.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index f2caf96..235ac44 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock); int nr_pdflush_threads = 0; /* - * The max/min number of pdflush threads. R/W by sysctl at - * /proc/sys/vm/nr_pdflush_threads_max/min - */ -int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; -int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; - - -/* * The time at which the pdflush thread pool last went empty */ static unsigned long last_empty_jifs; @@ -76,7 +68,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by nr_pdflush_threads_min and nr_pdflush_threads_max. + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. 
*/ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list) && - nr_pdflush_threads < nr_pdflush_threads_max) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); + if (list_empty(&pdflush_list)) { + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); + } } } @@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= nr_pdflush_threads_min) + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -266,9 +259,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. */ - nr_pdflush_threads = nr_pdflush_threads_min; + nr_pdflush_threads = MIN_PDFLUSH_THREADS; - for (i = 0; i < nr_pdflush_threads_min; i++) + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.1 From 22ef37eed673587ac984965dc88ba94c68873291 Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Sat, 16 May 2009 22:56:28 -0700 Subject: page-writeback: fix the calculation of the oldest_jif in wb_kupdate() wb_kupdate() function has a bug on linux-2.6.30-rc5. This bug causes generic_sync_sb_inodes() to start to write inodes back much earlier than our expectations because it miscalculates oldest_jif in wb_kupdate(). This bug was introduced in 704503d836042d4a4c7685b7036e7de0418fbc0f ('mm: fix proc_dointvec_userhz_jiffies "breakage"'). 
Signed-off-by: Toshiyuki Okajima Cc: Alexey Dobriyan Cc: Peter Zijlstra Cc: Nick Piggin Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 30351f0..bb553c3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -94,12 +94,12 @@ unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ -unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ +unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ /* * The longest time for which data is allowed to remain dirty */ -unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ +unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -770,7 +770,7 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); start_jif = jiffies; next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + -- cgit v1.1 From eb33575cf67d3f35fa2510210ef92631266e2465 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 13 May 2009 17:34:48 +0100 Subject: [ARM] Double check memmap is actually valid with a memmap has unexpected holes V2 pfn_valid() is meant to be able to tell if a given PFN has valid memmap associated with it or not. In FLATMEM, it is expected that holes always have valid memmap as long as there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed that a valid section has a memmap for the entire section. However, ARM and maybe other embedded architectures in the future free memmap backing holes to save memory on the assumption the memmap is never used. 
The page_zone linkages are then broken even though pfn_valid() returns true. A walker of the full memmap must then do this additional check to ensure the memmap they are looking at is sane by making sure the zone and PFN linkages are still valid. This is expensive, but walkers of the full memmap are extremely rare. This was caught before for FLATMEM and hacked around but it hits again for SPARSEMEM because the page_zone linkages can look ok where the PFN linkages are totally screwed. This looks like a hatchet job but the reality is that any clean solution would end up consuming all the memory saved by punching these unexpected holes in the memmap. For example, we tried marking the memmap within the section invalid but the section size exceeds the size of the hole in most cases so pfn_valid() starts returning false where valid memmap exists. Shrinking the size of the section would increase memory consumption, offsetting the gains. This patch identifies when an architecture is punching unexpected holes in the memmap that the memory model cannot automatically detect and sets ARCH_HAS_HOLES_MEMORYMODEL. At the moment, this is restricted to EP93xx which is the model sub-architecture this has been reported on but may expand later. When set, walkers of the full memmap must call memmap_valid_within() for each PFN and pass in what it expects the page and zone to be for that PFN. If it finds the linkages to be broken, it assumes the memmap is invalid for that PFN.
Signed-off-by: Mel Gorman Signed-off-by: Russell King --- mm/mmzone.c | 15 +++++++++++++++ mm/vmstat.c | 19 ++++--------------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/mmzone.c b/mm/mmzone.c index 16ce8b9..f5b7d17 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -6,6 +6,7 @@ #include <linux/stddef.h> +#include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> @@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, *zone = zonelist_zone(z); return z; } + +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + if (page_to_pfn(page) != pfn) + return 0; + + if (page_zone(page) != zone) + return 0; + + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 66f6130..74d66db 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); -#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES - /* - * Ordinarily, memory holes in flatmem still have a valid - * memmap for the PFN range. However, an architecture for - * embedded systems (e.g. ARM) can free up the memmap backing - * holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even - * though pfn_valid() returns true. Skip the page if the - * linkages are broken. Even if this test passed, the impact - * is that the counters for the movable type are off but - * fragmentation monitoring is likely meaningless on small - * systems.
- */ - if (page_zone(page) != zone) + + /* Watch for unexpected holes punched in the memmap */ + if (!memmap_valid_within(pfn, page, zone)) continue; -#endif + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) -- cgit v1.1 From 98f32602d42951e61a059685f842aa7d778ffab0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 21 May 2009 20:33:58 +0100 Subject: hugh: update email address My old address will shut down in a few days time: remove it from the tree, and add a tmpfs (shmem filesystem) maintainer entry with the new address. Signed-off-by: Hugh Dickins Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 1652166..23122af 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -14,7 +14,7 @@ * Original design by Rik van Riel 2001 * File methods by Dave McCracken 2003, 2004 * Anonymous methods by Andrea Arcangeli 2004 - * Contributions by Hugh Dickins 2003, 2004 + * Contributions by Hugh Dickins 2003, 2004 */ /* -- cgit v1.1 From 6d2661ede5f20f968422e790af3334908c3bc857 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 28 May 2009 14:34:19 -0700 Subject: oom: fix possible oom_dump_tasks NULL pointer When /proc/sys/vm/oom_dump_tasks is enabled, it is possible to get a NULL pointer for tasks that have detached mm's since task_lock() is not held during the tasklist scan. Add the task_lock(). 
Acked-by: Nick Piggin Acked-by: Mel Gorman Cc: Rik van Riel Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 92bcf1d..a7b2460 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -284,22 +284,28 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); do_each_thread(g, p) { - /* - * total_vm and rss sizes do not exist for tasks with a - * detached mm so there's no need to report them. - */ - if (!p->mm) - continue; + struct mm_struct *mm; + if (mem && !task_in_mem_cgroup(p, mem)) continue; if (!thread_group_leader(p)) continue; task_lock(p); + mm = p->mm; + if (!mm) { + /* + * total_vm and rss sizes do not exist for tasks with no + * mm so there's no need to report them; they can't be + * oom killed anyway. + */ + task_unlock(p); + continue; + } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, - p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), - p->oomkilladj, p->comm); + p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } -- cgit v1.1 From e767e0561d7fd2333df1921f1ab4176211f9036b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 28 May 2009 14:34:28 -0700 Subject: memcg: fix deadlock between lock_page_cgroup and mapping tree_lock mapping->tree_lock can be acquired from interrupt context. Then, the following deadlock can occur. Assume "A" is a page. CPU0: lock_page_cgroup(A) interrupted -> take mapping->tree_lock. CPU1: take mapping->tree_lock -> lock_page_cgroup(A) This patch tries to fix the above deadlock by moving memcg's hook out of mapping->tree_lock.
charge/uncharge of pagecache/swapcache is protected by page lock, not tree_lock. After this patch, lock_page_cgroup() is not called under mapping->tree_lock. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++--- mm/memcontrol.c | 4 +++- mm/swap_state.c | 4 +--- mm/truncate.c | 1 + mm/vmscan.c | 2 ++ 5 files changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 379ff0bc..1b60f30 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01c2d8f..4a747a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1488,8 +1488,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +#ifdef CONFIG_SWAP /* - * called from __delete_from_swap_cache() and drop "page" account. 
+ * called after __delete_from_swap_cache() and drop "page" account. * memcg information is recorded to swap_cgroup of "ent" */ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) @@ -1506,6 +1507,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) if (memcg) css_put(&memcg->css); } +#endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98..1416e7e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t ent = {.val = page_private(page)}; - VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); @@ -121,7 +119,6 @@ void __delete_from_swap_cache(struct page *page) total_swapcache_pages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); - mem_cgroup_uncharge_swapcache(page, ent); } /** @@ -191,6 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); + mem_cgroup_uncharge_swapcache(page, entry); swap_free(entry); page_cache_release(page); } diff --git a/mm/truncate.c b/mm/truncate.c index 55206fa..12e1579 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -359,6 +359,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/vmscan.c b/mm/vmscan.c index 5fa3eda..d254306 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,10 +470,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); + 
mem_cgroup_uncharge_swapcache(page, swap); swap_free(swap); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; -- cgit v1.1 From f83a275dbc5ca1721143698e844243fcadfabf6a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 May 2009 14:34:40 -0700 Subject: mm: account for MAP_SHARED mappings using VM_MAYSHARE and not VM_SHARED in hugetlbfs Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13302 hugetlbfs reserves huge pages but does not fault them at mmap() time to ensure that future faults succeed. The reservation behaviour differs depending on whether the mapping was mapped MAP_SHARED or MAP_PRIVATE. For MAP_SHARED mappings, hugepages are reserved when mmap() is first called and are tracked based on information associated with the inode. Other processes mapping MAP_SHARED use the same reservation. MAP_PRIVATE track the reservations based on the VMA created as part of the mmap() operation. Each process mapping MAP_PRIVATE must make its own reservation. hugetlbfs currently checks if a VMA is MAP_SHARED with the VM_SHARED flag and not VM_MAYSHARE. For file-backed mappings, such as hugetlbfs, VM_SHARED is set only if the mapping is MAP_SHARED and the file was opened read-write. If a shared memory mapping was mapped shared-read-write for populating of data and mapped shared-read-only by other processes, then hugetlbfs would account for the mapping as if it was MAP_PRIVATE. This causes processes to fail to map the file MAP_SHARED even though it should succeed as the reservation is there. This patch alters mm/hugetlb.c and replaces VM_SHARED with VM_MAYSHARE when the intent of the code was to check whether the VMA was mapped MAP_SHARED or MAP_PRIVATE. 
Signed-off-by: Mel Gorman Cc: Hugh Dickins Cc: Ingo Molnar Cc: Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Cc: Eric B Munson Cc: Adam Litke Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28c655b..e83ad2c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { 
VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -924,7 +924,7 @@ static long vma_needs_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -949,7 +949,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -1893,7 +1893,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. 
*/ - if (!(vma->vm_flags & VM_SHARED) && + if (!(vma->vm_flags & VM_MAYSHARE) && is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2000,7 +2000,7 @@ retry: clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { int err; struct inode *inode = mapping->host; @@ -2104,7 +2104,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_mutex; } - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, vma, address); } @@ -2289,7 +2289,7 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); @@ -2330,7 +2330,7 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; } -- cgit v1.1 From 46f7e602fb32e02145ef14f8c0ca6d399f0a96b9 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 28 May 2009 14:34:41 -0700 Subject: memcg: fix build warning and avoid checking for mem != null again and again Fix build warning, "mem_cgroup_is_obsolete defined but not used" when CONFIG_DEBUG_VM is not set. Also avoid checking for !mem again and again. 
Signed-off-by: Nikanth Karthikesan Acked-by: Pekka Enberg Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a747a2..78eb855 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) -{ - if (!mem) - return true; - return css_is_removed(&mem->css); -} - - /* * Call callback function against all cgroup under hierarchy tree. */ @@ -932,7 +924,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(css_is_removed(&mem->css)); while (1) { int ret; -- cgit v1.1