From cb129820f1e6ccf309510f4eb28df45cb0742005 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Mar 2012 09:45:58 -0700 Subject: percpu: use KERN_CONT in pcpu_dump_alloc_info() pcpu_dump_alloc_info() was printing continued lines without KERN_CONT. Use it. Signed-off-by: Tejun Heo Reported-by: Kay Sievers --- mm/percpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index f47af91..f921fdf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1132,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl, for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { - printk("\n"); + printk(KERN_CONT "\n"); printk("%spcpu-alloc: ", lvl); } - printk("[%0*d] ", group_width, group); + printk(KERN_CONT "[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) - printk("%0*d ", cpu_width, + printk(KERN_CONT "%0*d ", cpu_width, gi->cpu_map[unit]); else - printk("%s ", empty_str); + printk(KERN_CONT "%s ", empty_str); } } - printk("\n"); + printk(KERN_CONT "\n"); } /** -- cgit v1.1 From d833049bd20570cbbadeb5228c579f9f3aaa4e03 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 12 Apr 2012 12:49:11 -0700 Subject: memcg: fix broken boolean expression action != CPU_DEAD || action != CPU_DEAD_FROZEN is always true (no single value of action can equal both constants at once), so the callback always returned NOTIFY_OK early and the CPU_DEAD draining below it was never reached. Signed-off-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7d698df..ea1e879 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2165,7 +2165,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action == CPU_ONLINE) return NOTIFY_OK; - if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; for_each_mem_cgroup(iter) -- cgit v1.1 From 569530fb1b40ab2d2ca147ee79898ac807ebdf90 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 12 Apr 2012 12:49:13 -0700 Subject: memcg: do not open code accesses to res_counter members We should use the accessor res_counter_read_u64 for that. Although a purely cosmetic change is sometimes better delayed, to avoid conflicting with other people's work, we are starting to have people touching this code as well, and reproducing the open code behavior because that's the standard =) Time to fix it, then. Signed-off-by: Glauber Costa Cc: Johannes Weiner Acked-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ea1e879..a7165a6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3763,7 +3763,7 @@ move_account: goto try_to_free; cond_resched(); /* "ret" should also be checked to ensure all lists are empty. 
*/ - } while (memcg->res.usage > 0 || ret); + } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); out: css_put(&memcg->css); return ret; @@ -3778,7 +3778,7 @@ try_to_free: lru_add_drain_all(); /* try to free all pages in this cgroup */ shrink = 1; - while (nr_retries && memcg->res.usage > 0) { + while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { int progress; if (signal_pending(current)) { -- cgit v1.1 From 66aebce747eaf9bc456bf1f1b217d8db843031d0 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 12 Apr 2012 12:49:15 -0700 Subject: hugetlb: fix race condition in hugetlb_fault() The race is as follows: Suppose a multi-threaded task forks a new process (on cpu A), thus bumping up the ref count on all the pages. While the fork is occurring (and thus we have marked all the PTEs as read-only), another thread in the original process (on cpu B) tries to write to a huge page, taking an access violation from the write-protect and calling hugetlb_cow(). Now, suppose the fork() fails. It will undo the COW and decrement the ref count on the pages, so the ref count on the huge page drops back to 1. Meanwhile hugetlb_cow() also decrements the ref count by one on the original page, since the original address space doesn't need it any more, having copied a new page to replace the original page. This leaves the ref count at zero, and when we call unlock_page(), we panic. fork on CPU A fault on CPU B ============= ============== ... down_write(&parent->mmap_sem); down_write_nested(&child->mmap_sem); ... while duplicating vmas if error break; ... up_write(&child->mmap_sem); up_write(&parent->mmap_sem); ... down_read(&parent->mmap_sem); ... lock_page(page); handle COW page_mapcount(old_page) == 2 alloc and prepare new_page ... handle error page_remove_rmap(page); put_page(page); ... fold new_page into pte page_remove_rmap(page); put_page(page); ... oops ==> unlock_page(page); up_read(&parent->mmap_sem); The solution is to take an extra reference to the page while we are holding the lock on it. Signed-off-by: Chris Metcalf Cc: Hillf Danton Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b8ce6f4..cd65cb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2791,6 +2791,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * so no worry about deadlock. */ page = pte_page(entry); + get_page(page); if (page != pagecache_page) lock_page(page); @@ -2822,6 +2823,7 @@ out_page_table_lock: } if (page != pagecache_page) unlock_page(page); + put_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); -- cgit v1.1 From 41c93088127df2579e8ca64010929ec9e41d5543 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Thu, 12 Apr 2012 12:49:16 -0700 Subject: Revert "mm: vmscan: fix misused nr_reclaimed in shrink_mem_cgroup_zone()" This reverts commit c38446cc65e1f2b3eb8630c53943b94c4f65f670. Before the commit, the code made sense to me, but not after the commit. The "nr_reclaimed" is the number of pages reclaimed by scanning through the memcg's lru lists. The "nr_to_reclaim" is the target value for the whole function. For example, we want to break out of reclaim early once 32 pages have been reclaimed under direct reclaim (not DEF_PRIORITY). After the reverted commit, the target "nr_to_reclaim" is decremented by "nr_reclaimed" each time, yet "nr_reclaimed" is still compared against it. 
It just doesn't make sense to me... Signed-off-by: Ying Han Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 33c332b..1a51868 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2107,12 +2107,7 @@ restart: * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed >= nr_to_reclaim) - nr_to_reclaim = 0; - else - nr_to_reclaim -= nr_reclaimed; - - if (!nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } blk_finish_plug(&plug); -- cgit v1.1 From 9b7f43afd417a6feb80841d30ced4051c362eb5d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 18 Apr 2012 23:34:46 -0700 Subject: memcg: fix Bad page state after replace_page_cache My 9ce70c0240d0 "memcg: fix deadlock by inverting lrucare nesting" put a nasty little bug into v3.3's version of mem_cgroup_replace_page_cache(), sometimes used for FUSE. Replacing __mem_cgroup_commit_charge_lrucare() by __mem_cgroup_commit_charge(), I used the "pc" pointer set up earlier: but it's for oldpage, and needs now to be for newpage. Once oldpage was freed, its PageCgroupUsed bit (cleared above but set again here) caused "Bad page state" messages - and perhaps worse, being missed from newpage. (I didn't find this by using FUSE, but in reusing the function for tmpfs.) Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org [v3.3 only] Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a7165a6..b868def 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3392,6 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ + pc = lookup_page_cgroup(newpage); __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); } -- cgit v1.1 From b3dc627cabb33fc95f93da78457770c1b2a364d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Apr 2012 08:31:34 -0700 Subject: memblock: memblock should be able to handle zero length operations Commit 24aa07882b ("memblock, x86: Replace memblock_x86_reserve/ free_range() with generic ones") replaced x86 specific memblock operations with the generic ones; unfortunately, it lost zero length operation handling in the process making the kernel panic if somebody tries to reserve zero length area. There isn't much to be gained by being cranky to zero length operations and panicking is almost the worst response. Drop the BUG_ON() in memblock_reserve() and update memblock_add_region/isolate_range() so that all zero length operations are handled as noops. 
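The shape of the fix is a plain guard clause: both memblock_add_region() and memblock_isolate_range() now treat an empty request as a successful no-op at the top, instead of asserting later. A minimal userspace sketch of the same pattern (hypothetical region type and names, not the kernel code):

  #include <stdio.h>

  struct region { unsigned long base, size; };

  /* Reserve [base, base + size). A zero-length request is a legal
   * no-op rather than a fatal error, mirroring the memblock fix. */
  static int region_reserve(struct region *r, unsigned long base,
                            unsigned long size)
  {
          if (!size)
                  return 0;       /* nothing to do -- succeed quietly */
          r->base = base;
          r->size = size;
          return 0;
  }

  int main(void)
  {
          struct region r = { 0, 0 };
          printf("zero-length reserve: %d\n", region_reserve(&r, 0x1000, 0));
          return 0;
  }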
Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Reported-by: Valere Monseur Bisected-by: Joseph Freeman Tested-by: Joseph Freeman Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=43098 Signed-off-by: Linus Torvalds --- mm/memblock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 99f2855..a44eab3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -330,6 +330,9 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t end = base + memblock_cap_size(base, &size); int i, nr_new; + if (!size) + return 0; + /* special case for empty array */ if (type->regions[0].size == 0) { WARN_ON(type->cnt != 1 || type->total_size); @@ -430,6 +433,9 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, *start_rgn = *end_rgn = 0; + if (!size) + return 0; + /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) if (memblock_double_array(type) < 0) @@ -514,7 +520,6 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) (unsigned long long)base, (unsigned long long)base + size, (void *)_RET_IP_); - BUG_ON(0 == size); return memblock_add_region(_rgn, base, size, MAX_NUMNODES); } -- cgit v1.1 From e4eb1ff61b323d6141614e5458a1f53c7046ff8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 15:35:40 -0700 Subject: VM: add "vm_brk()" helper function It does the same thing as "do_brk()", except it handles the VM locking too. It turns out that all external callers want that anyway, so we can make do_brk() static to just mm/mmap.c while at it. Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 ++++++++++++++-- mm/nommu.c | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index a7bf6a3..df51891 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -240,6 +240,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } +static unsigned long do_brk(unsigned long addr, unsigned long len); + SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long rlim, retval; @@ -2136,7 +2138,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -unsigned long do_brk(unsigned long addr, unsigned long len) +static unsigned long do_brk(unsigned long addr, unsigned long len) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma, * prev; @@ -2232,7 +2234,17 @@ out: return addr; } -EXPORT_SYMBOL(do_brk); +unsigned long vm_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long ret; + + down_write(&mm->mmap_sem); + ret = do_brk(addr, len); + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(vm_brk); /* Release all mmaps. */ void exit_mmap(struct mm_struct *mm) diff --git a/mm/nommu.c b/mm/nommu.c index f59e170..6341933 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1744,7 +1744,7 @@ void exit_mmap(struct mm_struct *mm) kleave(""); } -unsigned long do_brk(unsigned long addr, unsigned long len) +unsigned long vm_brk(unsigned long addr, unsigned long len) { return -ENOMEM; } -- cgit v1.1 From a46ef99d80817a167477ed1c8b4d90ee0c2e726f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 16:20:01 -0700 Subject: VM: add "vm_munmap()" helper function Like the vm_brk() function, this is the same as "do_munmap()", except it does the VM locking for the caller. 
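Both vm_brk() and vm_munmap() follow the same shape: a thin wrapper that takes mmap_sem, calls the unlocked worker, and drops the lock, so callers can no longer forget the locking. A rough userspace analogue of that shape (hypothetical names, a pthreads rwlock standing in for mmap_sem):

  #include <pthread.h>

  static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

  /* Worker: caller must hold map_lock for writing (like do_munmap()). */
  static int do_unmap_locked(unsigned long start, unsigned long len)
  {
          /* ... the actual unmap work would go here ... */
          (void)start; (void)len;
          return 0;
  }

  /* Public helper: does the locking for the caller (like vm_munmap()). */
  int unmap(unsigned long start, unsigned long len)
  {
          int ret;

          pthread_rwlock_wrlock(&map_lock);
          ret = do_unmap_locked(start, len);
          pthread_rwlock_unlock(&map_lock);
          return ret;
  }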
Signed-off-by: Linus Torvalds --- mm/mmap.c | 15 +++++++++------ mm/nommu.c | 9 +++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index df51891..4af45f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2107,21 +2107,24 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } - EXPORT_SYMBOL(do_munmap); -SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +int vm_munmap(struct mm_struct *mm, unsigned long start, size_t len) { int ret; - struct mm_struct *mm = current->mm; - - profile_munmap(addr); down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); + ret = do_munmap(mm, start, len); up_write(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL(vm_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + profile_munmap(addr); + return vm_munmap(current->mm, addr, len); +} static inline void verify_mm_writelocked(struct mm_struct *mm) { diff --git a/mm/nommu.c b/mm/nommu.c index 6341933..11a69b2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1709,16 +1709,21 @@ erase_whole_vma: } EXPORT_SYMBOL(do_munmap); -SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +int vm_munmap(struct mm_struct *mm, unsigned long addr, size_t len) { int ret; - struct mm_struct *mm = current->mm; down_write(&mm->mmap_sem); ret = do_munmap(mm, addr, len); up_write(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL(vm_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + return vm_munmap(current->mm, addr, len); +} /* * release all the mappings made in a process's VM space -- cgit v1.1 From 6be5ceb02e98eaf6cfc4f8b12a896d04023f340d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 17:13:58 -0700 Subject: VM: add "vm_mmap()" helper function This continues the theme started with vm_brk() and vm_munmap(): vm_mmap() does the same thing as do_mmap(), but additionally does the required VM locking. This uninlines (and rewrites it to be clearer) do_mmap(), which sadly duplicates it in mm/mmap.c and mm/nommu.c. But that way we don't have to export our internal do_mmap_pgoff() function. Some day we hopefully don't have to export do_mmap() either, if all modular users can become the simpler vm_mmap() instead. We're actually very close to that already, with the notable exception of the (broken) use in i810, and a couple of stragglers in binfmt_elf. Signed-off-by: Linus Torvalds --- mm/mmap.c | 29 +++++++++++++++++++++++++++-- mm/nommu.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 4af45f5..b38b47e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -953,7 +953,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) * The caller must hold down_write(¤t->mm->mmap_sem). 
*/ -unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { @@ -1089,7 +1089,32 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return mmap_region(file, addr, len, flags, vm_flags, pgoff); } -EXPORT_SYMBOL(do_mmap_pgoff); + +unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(do_mmap); + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_mmap(file, addr, len, prot, flag, offset); + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(vm_mmap); SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/mm/nommu.c b/mm/nommu.c index 11a69b2..dd00383 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1233,7 +1233,7 @@ enomem: /* * handle mapping creation for uClinux */ -unsigned long do_mmap_pgoff(struct file *file, +static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1470,7 +1470,32 @@ error_getting_region: show_free_areas(0); return -ENOMEM; } -EXPORT_SYMBOL(do_mmap_pgoff); + +unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(do_mmap); + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_mmap(file, addr, len, prot, flag, offset); + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(vm_mmap); SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, -- cgit v1.1 From bfce281c287a427d0841fadf5d59242757b4e620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Apr 2012 21:57:04 -0400 Subject: kill mm argument of vm_munmap() it's always current->mm Signed-off-by: Al Viro --- mm/mmap.c | 5 +++-- mm/nommu.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index b38b47e..848ef52 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2134,9 +2134,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) } EXPORT_SYMBOL(do_munmap); -int vm_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int vm_munmap(unsigned long start, size_t len) { int ret; + struct mm_struct *mm = current->mm; down_write(&mm->mmap_sem); ret = do_munmap(mm, start, len); @@ -2148,7 +2149,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(current->mm, addr, len); + return vm_munmap(addr, 
len); } static inline void verify_mm_writelocked(struct mm_struct *mm) diff --git a/mm/nommu.c b/mm/nommu.c index dd00383..bb8f4f0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1734,8 +1734,9 @@ erase_whole_vma: } EXPORT_SYMBOL(do_munmap); -int vm_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +int vm_munmap(unsigned long addr, size_t len) { + struct mm_struct *mm = current->mm; int ret; down_write(&mm->mmap_sem); @@ -1747,7 +1748,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { - return vm_munmap(current->mm, addr, len); + return vm_munmap(addr, len); } /* -- cgit v1.1 From aca50bd3b4c4bb5528a1878158ba7abce41de534 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Apr 2012 11:14:50 -0700 Subject: mm: fix s390 BUG by __set_page_dirty_no_writeback on swap Mel reports a BUG_ON(slot == NULL) in radix_tree_tag_set() on s390 3.0.13: called from __set_page_dirty_nobuffers() when page_remove_rmap() tries to transfer dirty flag from s390 storage key to struct page and radix_tree. That would be because of reclaim's shrink_page_list() calling add_to_swap() on this page at the same time: first PageSwapCache is set (causing page_mapping(page) to appear as &swapper_space), then page->private set, then tree_lock taken, then page inserted into radix_tree - so there's an interval before taking the lock when the radix_tree slot is empty. We could fix this by moving __add_to_swap_cache()'s spin_lock_irq up before the SetPageSwapCache. But a better fix is simply to do what's five years overdue: Ken Chen introduced __set_page_dirty_no_writeback() (if !PageDirty TestSetPageDirty) for tmpfs to skip all the radix_tree overhead, and swap is just the same - it ignores the radix_tree tag, and does not participate in dirty page accounting, so should be using __set_page_dirty_no_writeback() too. s390 testing now confirms that this does indeed fix the problem. Reported-by: Mel Gorman Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Andrew Morton Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rik van Riel Cc: Ken Chen Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/swap_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index 9d3dd37..4c5ff7f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -26,7 +26,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = __set_page_dirty_no_writeback, .migratepage = migrate_page, }; -- cgit v1.1 From 4e1c2b284461fd8aa8d7b295a1e911fc4390755b Mon Sep 17 00:00:00 2001 From: David Miller Date: Wed, 25 Apr 2012 16:10:50 -0400 Subject: mm: nobootmem: Correct alloc_bootmem semantics. The comments above __alloc_bootmem_node() claim that the code will first try the allocation using 'goal' and if that fails it will try again but with the 'goal' requirement dropped. Unfortunately, this is not what the code does, so fix it to do so. This is important for nobootmem conversions to architectures such as sparc where MAX_DMA_ADDRESS is infinity. On such architectures all of the allocations done by generic spots, such as the sparse-vmemmap implementation, will pass in: __pa(MAX_DMA_ADDRESS) as the goal, and with the limit given as "-1" this will always fail unless we add the appropriate fallback logic here. Signed-off-by: David S. 
Miller Acked-by: Yinghai Lu Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 24f0fc1..e53bb8a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -298,13 +298,19 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +again: ptr = __alloc_memory_core_early(pgdat->node_id, size, align, goal, -1ULL); if (ptr) return ptr; - return __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); + if (!ptr && goal) { + goal = 0; + goto again; + } + return ptr; } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, -- cgit v1.1 From ce587e65e8c669eec61df7fb1c515720302e3cc0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 24 Apr 2012 20:22:33 +0200 Subject: mm: memcg: move pc lookup point to commit_charge() None of the callsites actually need the page_cgroup descriptor themselves, so just pass the page and do the look up in there. We already had two bugs (6568d4a 'mm: memcg: update the correct soft limit tree during migration' and 'memcg: fix Bad page state after replace_page_cache') where the passed page and pc were not referring to the same page frame. Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b868def..31ab9c3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2476,10 +2476,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, - struct page_cgroup *pc, enum charge_type ctype, bool lrucare) { + struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); bool was_on_lru = false; bool anon; @@ -2716,7 +2716,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; - struct page_cgroup *pc; bool oom = true; int ret; @@ -2730,11 +2729,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, oom = false; } - pc = lookup_page_cgroup(page); ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); + __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); return 0; } @@ -2831,16 +2829,13 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!memcg) return; cgroup_exclude_rmdir(&memcg->css); - pc = lookup_page_cgroup(page); - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); + __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. @@ -3298,14 +3293,13 @@ int mem_cgroup_prepare_migration(struct page *page, * page. In the case new page is migrated but not remapped, new page's * mapcount will be finally 0 and we call uncharge in end_migration(). 
*/ - pc = lookup_page_cgroup(newpage); if (PageAnon(page)) ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; else if (page_is_file_cache(page)) ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); return ret; } @@ -3392,8 +3386,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ - pc = lookup_page_cgroup(newpage); - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); + __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); } #ifdef CONFIG_DEBUG_VM -- cgit v1.1 From b1c12cbcd0a02527c180a862e8971e249d3b347d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Apr 2012 16:01:46 -0700 Subject: mm/hugetlb: fix warning in alloc_huge_page/dequeue_huge_page_vma Fix a gcc warning (and bug?) introduced in cc9a6c877 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") Local variable "page" can be uninitialized if the nodemask from vma policy does not intersect with the nodemask from cpuset. Even if that doesn't happen, it is better to initialize this variable explicitly than to introduce a kernel oops in a weird corner case. mm/hugetlb.c: In function `alloc_huge_page': mm/hugetlb.c:1135:5: warning: `page' may be used uninitialized in this function Signed-off-by: Konstantin Khlebnikov Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd65cb1..5a16423 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; -- cgit v1.1 From 904249aa68010c8e223263c922fcbb840a3f42e4 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Wed, 25 Apr 2012 16:01:48 -0700 Subject: mm: fix up the vmscan stat in vmstat The "pgsteal" stat is confusing because it counts both direct reclaim as well as background reclaim. However, we already have "kswapd_steal", which also counts background reclaim. This patch fixes it and also makes it match the existing "pgscan_" stats. 
Test: pgsteal_kswapd_dma32 447623 pgsteal_kswapd_normal 42272677 pgsteal_kswapd_movable 0 pgsteal_direct_dma32 2801 pgsteal_direct_normal 44353270 pgsteal_direct_movable 0 Signed-off-by: Ying Han Reviewed-by: Rik van Riel Acked-by: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Hugh Dickins Cc: Dan Magenheimer Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 11 ++++++++--- mm/vmstat.c | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a51868..33dc256 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1568,9 +1568,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, reclaim_stat->recent_scanned[0] += nr_anon; reclaim_stat->recent_scanned[1] += nr_file; - if (current_is_kswapd()) - __count_vm_events(KSWAPD_STEAL, nr_reclaimed); - __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); + if (global_reclaim(sc)) { + if (current_is_kswapd()) + __count_zone_vm_events(PGSTEAL_KSWAPD, zone, + nr_reclaimed); + else + __count_zone_vm_events(PGSTEAL_DIRECT, zone, + nr_reclaimed); + } putback_inactive_pages(mz, &page_list); diff --git a/mm/vmstat.c b/mm/vmstat.c index f600557..7db1b9b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -738,7 +738,8 @@ const char * const vmstat_text[] = { "pgmajfault", TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal") + TEXTS_FOR_ZONES("pgsteal_kswapd") + TEXTS_FOR_ZONES("pgsteal_direct") TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") @@ -747,7 +748,6 @@ const char * const vmstat_text[] = { #endif "pginodesteal", "slabs_scanned", - "kswapd_steal", "kswapd_inodesteal", "kswapd_low_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly", -- cgit v1.1 From f2a9ef880763d7fbd657a3af646e132a90d70d34 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 25 Apr 2012 16:01:52 -0700 Subject: mm: fix NULL ptr dereference in migrate_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") added an odd construct where 'mm' is checked for being NULL, but mmput() is then called on it unconditionally, so a NULL 'mm' still gets dereferenced. 
This would lead to the following NULL ptr deref and BUG() when calling migrate_pages() with a pid that has no mm struct: [25904.193704] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [25904.194235] IP: [] mmput+0x27/0xf0 [25904.194235] PGD 773e6067 PUD 77da0067 PMD 0 [25904.194235] Oops: 0002 [#1] PREEMPT SMP [25904.194235] CPU 2 [25904.194235] Pid: 31608, comm: trinity Tainted: G W 3.4.0-rc2-next-20120412-sasha #69 [25904.194235] RIP: 0010:[] [] mmput+0x27/0xf0 [25904.194235] RSP: 0018:ffff880077d49e08 EFLAGS: 00010202 [25904.194235] RAX: 0000000000000286 RBX: 0000000000000000 RCX: 0000000000000000 [25904.194235] RDX: ffff880075ef8000 RSI: 000000000000023d RDI: 0000000000000286 [25904.194235] RBP: ffff880077d49e18 R08: 0000000000000001 R09: 0000000000000001 [25904.194235] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [25904.194235] R13: 00000000ffffffea R14: ffff880034287740 R15: ffff8800218d3010 [25904.194235] FS: 00007fc8b244c700(0000) GS:ffff880029800000(0000) knlGS:0000000000000000 [25904.194235] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [25904.194235] CR2: 0000000000000050 CR3: 00000000767c6000 CR4: 00000000000406e0 [25904.194235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [25904.194235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [25904.194235] Process trinity (pid: 31608, threadinfo ffff880077d48000, task ffff880075ef8000) [25904.194235] Stack: [25904.194235] ffff8800342876c0 0000000000000000 ffff880077d49f78 ffffffff811b8020 [25904.194235] ffffffff811b7d91 ffff880075ef8000 ffff88002256d200 0000000000000000 [25904.194235] 00000000000003ff 0000000000000000 0000000000000000 0000000000000000 [25904.194235] Call Trace: [25904.194235] [] sys_migrate_pages+0x340/0x3a0 [25904.194235] [] ? sys_migrate_pages+0xb1/0x3a0 [25904.194235] [] system_call_fastpath+0x16/0x1b [25904.194235] Code: c9 c3 66 90 55 31 d2 48 89 e5 be 3d 02 00 00 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 89 fb 48 c7 c7 cf 0e e1 82 e8 69 18 03 00 ff 4b 50 0f 94 c0 84 c0 0f 84 aa 00 00 00 48 89 df e8 72 f1 [25904.194235] RIP [] mmput+0x27/0xf0 [25904.194235] RSP [25904.194235] CR2: 0000000000000050 [25904.348999] ---[ end trace a307b3ed40206b4b ]--- Signed-off-by: Sasha Levin Cc: Dave Hansen Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cfb6c86..b195691 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1361,11 +1361,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, mm = get_task_mm(task); put_task_struct(task); - if (mm) - err = do_migrate_pages(mm, old, new, - capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); - else + + if (!mm) { err = -EINVAL; + goto out; + } + + err = do_migrate_pages(mm, old, new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: -- cgit v1.1 From 6e8b09eaf268bceac0c62e389b4bc0cb83dfb8e5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 25 Apr 2012 16:01:53 -0700 Subject: mm: fix NULL ptr dereference in move_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") added an odd construct where 'mm' is checked for being NULL, but mmput() is then called on it unconditionally, so a NULL 'mm' still gets dereferenced. 
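Reduced to a self-contained sketch, the broken control flow and the fix look like this (mm_struct, mmput() and do_work() are toy stand-ins for the kernel code, with do_work() standing in for do_pages_move()/do_pages_stat()):

  #include <stdio.h>

  struct mm_struct { int users; };

  static void mmput(struct mm_struct *mm) { mm->users--; }
  static int do_work(struct mm_struct *mm) { (void)mm; return 0; }

  /*
   * Buggy shape from the original code -- mmput() runs even when
   * mm is NULL, so the NULL case still dereferences it:
   *
   *      if (mm)
   *              err = do_work(mm);
   *      else
   *              err = -EINVAL;
   *      mmput(mm);
   */
  static int op(struct mm_struct *mm)
  {
          int err;

          if (!mm)                /* fixed: bail out first */
                  return -22;     /* -EINVAL */
          err = do_work(mm);
          mmput(mm);
          return err;
  }

  int main(void)
  {
          struct mm_struct mm = { .users = 1 };
          printf("%d %d\n", op(NULL), op(&mm));
          return 0;
  }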
Signed-off-by: Sasha Levin Cc: Dave Hansen Cc: Mel Gorman Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 51c08a0..1107238 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1388,14 +1388,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, mm = get_task_mm(task); put_task_struct(task); - if (mm) { - if (nodes) - err = do_pages_move(mm, task_nodes, nr_pages, pages, - nodes, status, flags); - else - err = do_pages_stat(mm, nr_pages, pages, status); - } else - err = -EINVAL; + if (!mm) + return -EINVAL; + + if (nodes) + err = do_pages_move(mm, task_nodes, nr_pages, pages, + nodes, status, flags); + else + err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm); return err; -- cgit v1.1 From 42b64281453249dac52861f9b97d18552a7ec62b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Apr 2012 08:42:53 -0700 Subject: percpu: pcpu_embed_first_chunk() should free unused parts after all allocs are complete pcpu_embed_first_chunk() allocates memory for each node, copies percpu data and frees unused portions of it before proceeding to the next group. This assumes that allocations for different nodes don't overlap; however, depending on memory topology, the bootmem allocator may end up allocating memory from a different node than the requested one, which may overlap with the portion freed from one of the previous percpu areas. This leads to percpu groups for different nodes overlapping which is a serious bug. This patch separates out copy & partial free from the allocation loop such that all allocations are complete before partial frees happen. This also fixes overlapping frees which could happen on allocation failure path - out_free_areas path frees whole groups but the groups could have portions freed at that point. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Reported-by: "Pavel V. Panteleev" Tested-by: "Pavel V. Panteleev" LKML-Reference: --- mm/percpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index f921fdf..ac5c626 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1650,6 +1650,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, areas[group] = ptr; base = min(ptr, base); + } + + /* + * Copy data and free unused parts. This should happen after all + * allocations are complete; otherwise, we may end up with + * overlapping groups. + */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { -- cgit v1.1 From 100d13c3b5b9410f604b86f5e0a34da64b8cf659 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 9 May 2012 16:55:19 +0100 Subject: kmemleak: Fix the kmemleak tracking of the percpu areas with !SMP Kmemleak tracks the percpu allocations via a specific API and the originally allocated areas must be removed from kmemleak (via kmemleak_free). The code was already doing this for SMP systems. 
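The general rule at work: when memory obtained from a tracked allocator is handed over to a second allocator that does its own accounting, the first tracker must be told to forget the block, or the same memory ends up tracked twice. A compilable sketch of that hand-off (the tracker stubs are hypothetical stand-ins, not the kmemleak API):

  #include <stdio.h>
  #include <stdlib.h>

  /* Stand-ins for a leak tracker's record/forget hooks. */
  static void tracker_note_alloc(void *p, size_t n)
  {
          printf("track  %p (%zu bytes)\n", p, n);
  }

  static void tracker_forget(void *p)
  {
          printf("forget %p\n", p);
  }

  /* Bootstrap a chunk that a sub-allocator will manage from now on. */
  static void *bootstrap_chunk(size_t n)
  {
          void *base = malloc(n);
          tracker_note_alloc(base, n);    /* the general tracker saw it */
          tracker_forget(base);           /* sub-allocator now accounts
                                             for it through its own API */
          return base;
  }

  int main(void)
  {
          free(bootstrap_chunk(4096));
          return 0;
  }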
Reported-by: Sami Liedes Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Catalin Marinas Signed-off-by: Tejun Heo --- mm/percpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index ac5c626..bb4be74 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1895,6 +1895,8 @@ void __init setup_per_cpu_areas(void) fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); + /* kmemleak tracks the percpu allocations separately */ + kmemleak_free(fc); ai->dyn_size = unit_size; ai->unit_size = unit_size; -- cgit v1.1 From 93278814d3590eba0ee360b8d69a35c7f2203ea8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 10 May 2012 13:01:44 -0700 Subject: mm: fix division by 0 in percpu_pagelist_fraction() percpu_pagelist_fraction_sysctl_handler() has only considered -EINVAL as a possible error from proc_dointvec_minmax(). If any other error is returned, it would proceed to divide by zero since percpu_pagelist_fraction wasn't getting initialized at any point. For example, writing 0 bytes into the proc file would trigger the issue. Signed-off-by: Sasha Levin Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a712fb9..b21b3db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,7 +105,7 @@ unsigned long totalreserve_pages __read_mostly; */ unsigned long dirty_balance_reserve __read_mostly; -int percpu_pagelist_fraction; +int percpu_pagelist_fraction = 8; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_PM_SLEEP @@ -5203,7 +5203,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, int ret; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret == -EINVAL)) + if (!write || (ret < 0)) return ret; for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { -- cgit v1.1 From 4998a6c0edce7fae9c0a5463f6ec3fa585258ee7 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 10 May 2012 13:01:44 -0700 Subject: hugetlb: prevent BUG_ON in hugetlb_fault() -> hugetlb_cow() Commit 66aebce747eaf ("hugetlb: fix race condition in hugetlb_fault()") added code to avoid a race condition by elevating the page refcount in hugetlb_fault() while calling hugetlb_cow(). However, one code path in hugetlb_cow() includes an assertion that the page count is 1, whereas it may now also have the value 2 in this path. The consensus is that this BUG_ON has served its purpose, so rather than extending it to cover both cases, we just remove it. 
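The assertion became wrong for a simple reason: once the caller pins the page across the call, the callee can legitimately observe a reference count of 2 where it could previously rely on 1. In sketch form (a toy refcounted object, not struct page):

  #include <assert.h>

  struct obj { int refcount; };

  /* Written when callers never held an extra pin across the call. */
  static void handle_private_copy(struct obj *o)
  {
          /* Was: assert(o->refcount == 1);
           * After the caller-side get_page()/put_page() fix, a count
           * of 2 is equally valid here, so the assertion must go. */
          assert(o->refcount == 1 || o->refcount == 2);
  }

  int main(void)
  {
          struct obj o = { .refcount = 1 };
          o.refcount++;                   /* caller pins across the call */
          handle_private_copy(&o);
          o.refcount--;
          return 0;
  }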
Signed-off-by: Chris Metcalf Acked-by: Mel Gorman Acked-by: Hillf Danton Acked-by: Hugh Dickins Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: [3.0.29+, 3.2.16+, 3.3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5a16423..ae8f708 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2498,7 +2498,6 @@ retry_avoidcopy: if (outside_reserve) { BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); -- cgit v1.1 From 8c7577637ca31385e92769a77e2ab5b428e8b99c Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 10 May 2012 13:01:45 -0700 Subject: memcg: free spare array to avoid a memory leak When the last event is unregistered, there is no need to keep the spare array anymore. So free it to avoid a memory leak. Signed-off-by: Sha Zhengju Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31ab9c3..b659260 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4507,6 +4507,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } + rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ -- cgit v1.1 From 6bc2e853c6b46a6041980d58200ad9b0a73a60ff Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Thu, 10 May 2012 13:01:46 -0700 Subject: mm: nobootmem: fix sign extend problem in __free_pages_memory() Systems with 8 TBytes of memory or greater can hit a problem where only the first 8 TB of memory shows up. This is due to "int i" being smaller than "unsigned long start_aligned", causing the high bits to be dropped. The fix is to change `i' to unsigned long to match start_aligned and end_aligned. Thanks to Jack Steiner for assistance tracking this down. Signed-off-by: Russ Anderson Cc: Jack Steiner Cc: Johannes Weiner Cc: Tejun Heo Cc: David S. Miller Cc: Yinghai Lu Cc: Gavin Shan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e53bb8a..1983fb1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -82,8 +82,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static void __init __free_pages_memory(unsigned long start, unsigned long end) { - int i; - unsigned long start_aligned, end_aligned; + unsigned long i, start_aligned, end_aligned; int order = ilog2(BITS_PER_LONG); start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); -- cgit v1.1 From 1b76b02f15c70d5f392ee2e231fbd20a26063a77 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 11 May 2012 01:00:07 -0700 Subject: mm: raise MemFree by reverting percpu_pagelist_fraction to 0 Why is there less MemFree than there used to be? It perturbed a test, so I've just been bisecting linux-next, and now find the offender went upstream yesterday. 
Commit 93278814d359 "mm: fix division by 0 in percpu_pagelist_fraction()" mistakenly initialized percpu_pagelist_fraction to the sysctl's minimum 8, which leaves 1/8th of memory on percpu lists (on each cpu??); but most of us expect it to be left unset at 0 (and it's not then used as a divisor). MemTotal: 8061476kB 8061476kB 8061476kB 8061476kB 8061476kB 8061476kB Repetitive test with percpu_pagelist_fraction 8: MemFree: 6948420kB 6237172kB 6949696kB 6840692kB 6949048kB 6862984kB Same test with percpu_pagelist_fraction back to 0: MemFree: 7945000kB 7944908kB 7948568kB 7949060kB 7948796kB 7948812kB Signed-off-by: Hugh Dickins [ We really should fix the crazy sysctl interface too, but that's a separate thing - Linus ] Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b21b3db..918330f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,7 +105,7 @@ unsigned long totalreserve_pages __read_mostly; */ unsigned long dirty_balance_reserve __read_mostly; -int percpu_pagelist_fraction = 8; +int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_PM_SLEEP -- cgit v1.1 From 02e1a9cd1ea99b67a668f13b61fdf5d42115db0a Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 17 May 2012 17:03:26 -0700 Subject: slub: missing test for partial pages flush work in flush_all() I found some kernel messages such as: SLUB raid5-md127: kmem_cache_destroy called for cache that still has objects. Pid: 6143, comm: mdadm Tainted: G O 3.4.0-rc6+ #75 Call Trace: kmem_cache_destroy+0x328/0x400 free_conf+0x2d/0xf0 [raid456] stop+0x41/0x60 [raid456] md_stop+0x1a/0x60 [md_mod] do_md_stop+0x74/0x470 [md_mod] md_ioctl+0xff/0x11f0 [md_mod] blkdev_ioctl+0xd8/0x7a0 block_ioctl+0x3b/0x40 do_vfs_ioctl+0x96/0x560 sys_ioctl+0x91/0xa0 system_call_fastpath+0x16/0x1b Then using kmemleak I found these messages: unreferenced object 0xffff8800b6db7380 (size 112): comm "mdadm", pid 5783, jiffies 4294810749 (age 90.589s) hex dump (first 32 bytes): 01 01 db b6 ad 4e ad de ff ff ff ff ff ff ff ff .....N.......... ff ff ff ff ff ff ff ff 98 40 4a 82 ff ff ff ff .........@J..... backtrace: kmemleak_alloc+0x21/0x50 kmem_cache_alloc+0xeb/0x1b0 kmem_cache_open+0x2f1/0x430 kmem_cache_create+0x158/0x320 setup_conf+0x649/0x770 [raid456] run+0x68b/0x840 [raid456] md_run+0x529/0x940 [md_mod] do_md_run+0x18/0xc0 [md_mod] md_ioctl+0xba8/0x11f0 [md_mod] blkdev_ioctl+0xd8/0x7a0 block_ioctl+0x3b/0x40 do_vfs_ioctl+0x96/0x560 sys_ioctl+0x91/0xa0 system_call_fastpath+0x16/0x1b This bug was introduced by commit a8364d5555b ("slub: only IPI CPUs that have per cpu obj to flush"), which did not include checks for per cpu partial pages being present on a cpu. 
Signed-off-by: majianpeng Cc: Gilad Ben-Yossef Acked-by: Christoph Lameter Cc: Pekka Enberg Tested-by: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ffe13fd..80848cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2040,7 +2040,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return !!(c->page); + return c->page || c->partial; } static void flush_all(struct kmem_cache *s) -- cgit v1.1 From 62ade86ab6c7e26409229ca45503cae97bf698cf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 May 2012 11:28:34 -0700 Subject: memcg,thp: fix res_counter:96 regression Occasionally, testing memcg's move_charge_at_immigrate on rc7 shows a flurry of hundreds of warnings at kernel/res_counter.c:96, where res_counter_uncharge_locked() does WARN_ON(counter->usage < val). The first trace of each flurry implicates __mem_cgroup_cancel_charge() of mc.precharge, and an audit of mc.precharge handling points to mem_cgroup_move_charge_pte_range()'s THP handling in commit 12724850e806 ("memcg: avoid THP split in task migration"). Checking !mc.precharge is good everywhere else, when a single page is to be charged; but here the "mc.precharge -= HPAGE_PMD_NR" likely to follow, is liable to result in underflow (a lot can change since the precharge was estimated). Simply check against HPAGE_PMD_NR: there's probably a better alternative, trying precharge for more, splitting if unsuccessful; but this one-liner is safer for now - no kernel/res_counter.c:96 warnings seen in 26 hours. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b659260..7685d4a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5481,7 +5481,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * part of thp split is not executed yet. */ if (pmd_trans_huge_lock(pmd, vma) == 1) { - if (!mc.precharge) { + if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(&vma->vm_mm->page_table_lock); return 0; } -- cgit v1.1
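The underflow mode described above is ordinary unsigned arithmetic: mc.precharge is an unsigned counter, so subtracting HPAGE_PMD_NR from a smaller value wraps around to a huge number, and the later uncharges then trip the usage < val warning in res_counter. A minimal illustration (512 is a stand-in for HPAGE_PMD_NR, its value on x86 with 4K pages):

  #include <stdio.h>

  #define HPAGE_NR 512UL

  int main(void)
  {
          /* Buggy shape: only tests for zero before a bulk decrement. */
          unsigned long precharge = 100;  /* the estimate went stale */
          if (precharge != 0)
                  precharge -= HPAGE_NR;  /* wraps to a huge value */
          printf("wrapped precharge: %lu\n", precharge);

          /* Fixed shape: require the full budget, as the one-liner does. */
          precharge = 100;
          if (precharge < HPAGE_NR)
                  printf("skip: precharge too small\n");
          else
                  precharge -= HPAGE_NR;
          return 0;
  }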