Merge branch 'android-3.0' into android-omap-3.0

author: Todd Poynor <toddpoynor@google.com> 2013-03-13 17:35:08 -0700
committer: Todd Poynor <toddpoynor@google.com> 2013-03-13 17:35:08 -0700
commit: 1249d25539343d1a7fe361c8d40be7a5df5b0216 (patch)
tree: 770c9a33604bb00c62f44d780aefab27e4eb9379 /mm
parent: 4aad13d07babf68c1d0d37ff1e5f797573c4fd2a (diff)
parent: 0b203ab4aacdb6e6dfb8c277aa290f0a02428e6f (diff)
download: kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.zip
kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.tar.gz
kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.tar.bz2
7 files changed, 140 insertions, 79 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 8ea7308..b4689f81 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -714,14 +714,12 @@ static int compact_node(int nid)
 }
 
 /* Compact all nodes in the system */
-static int compact_nodes(void)
+static void compact_nodes(void)
 {
 	int nid;
 
 	for_each_online_node(nid)
 		compact_node(nid);
-
-	return COMPACT_COMPLETE;
 }
 
 /* The written value is actually unused, all memory is compacted */
@@ -732,7 +730,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	if (write)
-		return compact_nodes();
+		compact_nodes();
 
 	return 0;
 }
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 8d723c9..35b2bb0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
 #include <linux/fadvise.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
+#include <linux/swap.h>
 
 #include <asm/unistd.h>
 
@@ -123,9 +124,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
-		if (end_index >= start_index)
-			invalidate_mapping_pages(mapping, start_index,
+		if (end_index >= start_index) {
+			unsigned long count = invalidate_mapping_pages(mapping,
+						start_index, end_index);
+
+			/*
+			 * If fewer pages were invalidated than expected then
+			 * it is possible that some of the pages were on
+			 * a per-cpu pagevec for a remote CPU. Drain all
+			 * pagevecs and try again.
+			 */
+			if (count < (end_index - start_index + 1)) {
+				lru_add_drain_all();
+				invalidate_mapping_pages(mapping, start_index,
 						end_index);
+			}
+		}
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cc11dd..a9ab45e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -920,6 +920,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
+		if (ret & VM_FAULT_OOM)
+			split_huge_page(page);
 		put_page(page);
 		goto out;
 	}
@@ -927,6 +929,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
+		split_huge_page(page);
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
diff --git a/mm/memory.c b/mm/memory.c
index 7292acb..4da0f8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3470,6 +3470,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
+retry:
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
@@ -3483,13 +3484,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
+		int ret;
+
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
 			if (flags & FAULT_FLAG_WRITE &&
 			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd))
-				return do_huge_pmd_wp_page(mm, vma, address,
-							   pmd, orig_pmd);
+			    !pmd_trans_splitting(orig_pmd)) {
+				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+							  orig_pmd);
+				/*
+				 * If COW results in an oom, the huge pmd will
+				 * have been split, so retry the fault on the
+				 * pte for a smaller charge.
+				 */
+				if (unlikely(ret & VM_FAULT_OOM))
+					goto retry;
+				return ret;
+			}
 			return 0;
 		}
 	}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 71c7811..88fa54d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/srcu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,58 +29,61 @@
  * in parallel despite there being no task using this mm any more,
  * through the vmas outside of the exit_mmap context, such as with
  * vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
  * can't go away from under us as exit_mmap holds an mm_count pin
  * itself.
  */
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
-	struct hlist_node *n;
+	int id;
 
 	/*
-	 * RCU here will block mmu_notifier_unregister until
-	 * ->release returns.
+	 * srcu_read_lock() here will block synchronize_srcu() in
+	 * mmu_notifier_unregister() until all registered
+	 * ->release() callouts this function makes have
+	 * returned.
 	 */
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
-		/*
-		 * if ->release runs before mmu_notifier_unregister it
-		 * must be handled as it's the only way for the driver
-		 * to flush all existing sptes and stop the driver
-		 * from establishing any more sptes before all the
-		 * pages in the mm are freed.
-		 */
-		if (mn->ops->release)
-			mn->ops->release(mn, mm);
-	rcu_read_unlock();
-
+	id = srcu_read_lock(&srcu);
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
 		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 				 struct mmu_notifier,
 				 hlist);
+
 		/*
-		 * We arrived before mmu_notifier_unregister so
-		 * mmu_notifier_unregister will do nothing other than
-		 * to wait ->release to finish and
-		 * mmu_notifier_unregister to return.
+		 * Unlink.  This will prevent mmu_notifier_unregister()
+		 * from also making the ->release() callout.
 		 */
 		hlist_del_init_rcu(&mn->hlist);
+		spin_unlock(&mm->mmu_notifier_mm->lock);
+
+		/*
+		 * Clear sptes. (see 'release' description in mmu_notifier.h)
+		 */
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+
+		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * synchronize_rcu here prevents mmu_notifier_release to
-	 * return to exit_mmap (which would proceed freeing all pages
-	 * in the mm) until the ->release method returns, if it was
-	 * invoked by mmu_notifier_unregister.
-	 *
-	 * The mmu_notifier_mm can't go away from under us because one
-	 * mm_count is hold by exit_mmap.
+	 * All callouts to ->release() which we have done are complete.
+	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
+	 */
+	srcu_read_unlock(&srcu, id);
+
+	/*
+	 * mmu_notifier_unregister() may have unlinked a notifier and may
+	 * still be calling out to it.	Additionally, other notifiers
+	 * may have been active via vmtruncate() et. al. Block here
+	 * to ensure that all notifier callouts for this mm have been
+	 * completed and the sptes are really cleaned up before returning
+	 * to exit_mmap().
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&srcu);
 }
 
 /*
@@ -89,14 +96,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0;
+	int young = 0, id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->clear_flush_young)
 			young |= mn->ops->clear_flush_young(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -106,9 +113,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
-	int young = 0;
+	int young = 0, id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->test_young) {
 			young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +123,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 				break;
 		}
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 
 	return young;
 }
@@ -126,8 +133,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->change_pte)
 			mn->ops->change_pte(mn, mm, address, pte);
@@ -138,7 +146,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 		else if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +154,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_page)
 			mn->ops->invalidate_page(mn, mm, address);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +169,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start)
 			mn->ops->invalidate_range_start(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +184,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 {
 	struct mmu_notifier *mn;
 	struct hlist_node *n;
+	int id;
 
-	rcu_read_lock();
+	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_end)
 			mn->ops->invalidate_range_end(mn, mm, start, end);
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&srcu, id);
 }
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +203,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
+	/*
+	* Verify that mmu_notifier_init() already run and the global srcu is
+	* initialized.
+	*/
+	BUG_ON(!srcu.per_cpu_ref);
+
 	ret = -ENOMEM;
 	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
 	if (unlikely(!mmu_notifier_mm))
@@ -274,8 +291,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
 /*
  * This releases the mm_count pin automatically and frees the mm
  * structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
  * calling mmu_notifier_unregister. ->release or any other notifier
  * method may be invoked concurrently with mmu_notifier_unregister,
  * and only after mmu_notifier_unregister returned we're guaranteed
@@ -285,35 +302,43 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
+	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
-		/*
-		 * RCU here will force exit_mmap to wait ->release to finish
-		 * before freeing the pages.
-		 */
-		rcu_read_lock();
+		int id;
 
 		/*
-		 * exit_mmap will block in mmu_notifier_release to
-		 * guarantee ->release is called before freeing the
-		 * pages.
+		 * Ensure we synchronize up with __mmu_notifier_release().
 		 */
+		id = srcu_read_lock(&srcu);
+
+		hlist_del_rcu(&mn->hlist);
+		spin_unlock(&mm->mmu_notifier_mm->lock);
+
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		rcu_read_unlock();
 
-		spin_lock(&mm->mmu_notifier_mm->lock);
-		hlist_del_rcu(&mn->hlist);
+		/*
+		 * Allow __mmu_notifier_release() to complete.
+		 */
+		srcu_read_unlock(&srcu, id);
+	} else
 		spin_unlock(&mm->mmu_notifier_mm->lock);
-	}
 
 	/*
-	 * Wait any running method to finish, of course including
-	 * ->release if it was run by mmu_notifier_relase instead of us.
+	 * Wait for any running method to finish, including ->release() if it
+	 * was run by __mmu_notifier_release() instead of us.
 	 */
-	synchronize_rcu();
+	synchronize_srcu(&srcu);
 
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
 	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+	return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bfe7894..aed2f55 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4287,10 +4287,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
  * round what is now in bits to nearest long in bits, then return it in
  * bytes.
  */
-static unsigned long __init usemap_size(unsigned long zonesize)
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
 {
 	unsigned long usemapsize;
 
+	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
 	usemapsize = roundup(zonesize, pageblock_nr_pages);
 	usemapsize = usemapsize >> pageblock_order;
 	usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4300,17 +4301,19 @@ static unsigned long __init usemap_size(unsigned long zonesize)
 }
 
 static void __init setup_usemap(struct pglist_data *pgdat,
-				struct zone *zone, unsigned long zonesize)
+				struct zone *zone,
+				unsigned long zone_start_pfn,
+				unsigned long zonesize)
 {
-	unsigned long usemapsize = usemap_size(zonesize);
+	unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
 	zone->pageblock_flags = NULL;
 	if (usemapsize)
 		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
 								   usemapsize);
 }
 #else
-static inline void setup_usemap(struct pglist_data *pgdat,
-				struct zone *zone, unsigned long zonesize) {}
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+				unsigned long zone_start_pfn, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -4438,7 +4441,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 			continue;
 
 		set_pageblock_order(pageblock_default_order());
-		setup_usemap(pgdat, zone, size);
+		setup_usemap(pgdat, zone, zone_start_pfn, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn,
 						size, MEMMAP_EARLY);
 		BUG_ON(ret);
@@ -5515,7 +5518,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
 	pfn &= (PAGES_PER_SECTION-1);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #else
-	pfn = pfn - zone->zone_start_pfn;
+	pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #endif /* CONFIG_SPARSEMEM */
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 492584c..bcfa97d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2505,6 +2505,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	unsigned long inodes;
 	int error = -EINVAL;
 
+	config.mpol = NULL;
 	if (shmem_parse_options(data, &config, true))
 		return error;
 
@@ -2530,8 +2531,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	sbinfo->max_inodes  = config.max_inodes;
 	sbinfo->free_inodes = config.max_inodes - inodes;
 
-	mpol_put(sbinfo->mpol);
-	sbinfo->mpol        = config.mpol;	/* transfers initial ref */
+	/*
+	 * Preserve previous mempolicy unless mpol remount option was specified.
+	 */
+	if (config.mpol) {
+		mpol_put(sbinfo->mpol);
+		sbinfo->mpol = config.mpol;	/* transfers initial ref */
+	}
 out:
 	spin_unlock(&sbinfo->stat_lock);
 	return error;
author	Todd Poynor <toddpoynor@google.com>	2013-03-13 17:35:08 -0700
committer	Todd Poynor <toddpoynor@google.com>	2013-03-13 17:35:08 -0700
commit	1249d25539343d1a7fe361c8d40be7a5df5b0216 (patch)
tree	770c9a33604bb00c62f44d780aefab27e4eb9379 /mm
parent	4aad13d07babf68c1d0d37ff1e5f797573c4fd2a (diff)
parent	0b203ab4aacdb6e6dfb8c277aa290f0a02428e6f (diff)
download	kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.zip kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.tar.gz kernel_samsung_tuna-1249d25539343d1a7fe361c8d40be7a5df5b0216.tar.bz2