aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author: John W. Linville <linville@tuxdriver.com>, 2006-02-27 20:12:23 -0500
committer: John W. Linville <linville@tuxdriver.com>, 2006-02-27 20:12:23 -0500
commit: 9f5a405b6843933c1cae5826046a5dd6357f142a (patch)
tree: bcdbb0175d37b780551b71c40abcde964a8905eb /mm
parent: b7cffb028abbffff3ba0b87268ecb775ed354049 (diff)
parent: 051d3cbd96909b2fe6b5038e7bbe77f41356db05 (diff)
download: kernel_samsung_smdk4412-9f5a405b6843933c1cae5826046a5dd6357f142a.zip
kernel_samsung_smdk4412-9f5a405b6843933c1cae5826046a5dd6357f142a.tar.gz
kernel_samsung_smdk4412-9f5a405b6843933c1cae5826046a5dd6357f142a.tar.bz2
Merge branch 'from-linus'
Diffstat (limited to 'mm')
-rw-r--r-- mm/memory.c 10
-rw-r--r-- mm/mempolicy.c 44
-rw-r--r-- mm/nommu.c 2
-rw-r--r-- mm/oom_kill.c 123
-rw-r--r-- mm/page_alloc.c 24
-rw-r--r-- mm/shmem.c 81
-rw-r--r-- mm/vmscan.c 10
7 files changed, 226 insertions, 68 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f2..9abc600 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,21 @@ EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+
+/*
+ * "norandmaps" boot option: clear randomize_va_space.
+ * Returns 1 so that the __setup() machinery treats the option as
+ * handled; returning 0 would leave it to be passed on as unknown.
+ */
+static int __init disable_randmaps(char *s)
+{
+ randomize_va_space = 0;
+ return 1;
+}
+__setup("norandmaps", disable_randmaps);
+
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7..67af4ce 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
+
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd;
+ int num, max, nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes)
- zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+ /* First put in the highest zones from all nodes, then all the next
+ lower zones etc. Avoid empty zones because the memory allocator
+ doesn't like them. If you implement node hot removal you
+ have to fix that. */
+ for (k = policy_zone; k >= 0; k--) {
+ for_each_node_mask(nd, *nodes) {
+ struct zone *z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ zl->zones[num++] = z;
+ }
+ }
zl->zones[num] = NULL;
return zl;
}
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (isolate_lru_page(page))
- list_add(&page->lru, pagelist);
+ list_add_tail(&page->lru, pagelist);
}
}
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
LIST_HEAD(moved);
LIST_HEAD(failed);
int err = 0;
+ unsigned long offset = 0;
int nr_pages;
struct page *page;
struct list_head *p;
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
redo:
nr_pages = 0;
list_for_each(p, pagelist) {
- if (vma)
- page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+ if (vma) {
+ /*
+ * The address passed to alloc_page_vma is used to
+ * generate the proper interleave behavior. We fake
+ * the address here by an increasing offset in order
+ * to get the proper distribution of pages.
+ *
+ * No decision has been made as to which page
+ * a certain old page is moved to so we cannot
+ * specify the correct address.
+ */
+ page = alloc_page_vma(GFP_HIGHUSER, vma,
+ offset + vma->vm_start);
+ offset += PAGE_SIZE;
+ }
else
page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
@@ -575,9 +599,9 @@ redo:
err = -ENOMEM;
goto out;
}
- list_add(&page->lru, &newlist);
+ list_add_tail(&page->lru, &newlist);
nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE);
+ if (nr_pages > MIGRATE_CHUNK_SIZE)
break;
}
err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d..99d2102 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
/*
* Handle all mappings that got truncated by a "truncate()"
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f..8123fad 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/*
* Processes which fork a lot of child processes are likely
- * a good choice. We add the vmsize of the children if they
+ * a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children
+ * machine with an endless amount of children. In case a single
+ * child is eating the vast majority of memory, adding only half
+ * to the parents will make the child our kill candidate of choice.
*/
list_for_each(tsk, &p->children) {
struct task_struct *chld;
chld = list_entry(tsk, struct task_struct, sibling);
if (chld->mm != p->mm && chld->mm)
- points += chld->mm->total_vm;
+ points += chld->mm->total_vm/2 + 1;
}
/*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+ struct zone **z;
+ nodemask_t nodes = node_online_map; /* online nodes not yet seen in zonelist */
+
+ for (z = zonelist->zones; *z; z++) /* walk until the NULL entry */
+ if (cpuset_zone_allowed(*z, gfp_mask))
+ node_clear((*z)->zone_pgdat->node_id,
+ nodes);
+ else
+ return CONSTRAINT_CPUSET; /* zonelist holds a zone the cpuset check rejects */
+
+ if (!nodes_empty(nodes)) /* some online node has no zone in this zonelist */
+ return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+ return CONSTRAINT_NONE; /* also the only result when CONFIG_NUMA is off */
+}
+
+/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
{
- unsigned long maxpoints = 0;
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+ *ppoints = 0;
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
return p;
points = badness(p, uptime.tv_sec);
- if (points > maxpoints || !chosen) {
+ if (points > *ppoints || !chosen) {
chosen = p;
- maxpoints = points;
+ *ppoints = points;
}
} while_each_thread(g, p);
return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
* we select a process with CAP_SYS_RAW_IO set).
*/
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
{
if (p->pid == 1) {
WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
return;
}
task_unlock(p);
- printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
- p->pid, p->comm);
+ printk(KERN_ERR "%s: Killed process %d (%s).\n",
+ message, p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
force_sig(SIGKILL, p);
}
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
{
struct mm_struct *mm = get_task_mm(p);
task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
return NULL;
}
- __oom_kill_task(p);
+ __oom_kill_task(p, message);
/*
* kill all processes that share the ->mm (i.e. all threads),
* but are in a different thread group
*/
do_each_thread(g, q)
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
+ __oom_kill_task(q, message);
while_each_thread(g, q);
return mm;
}
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+ unsigned long points, const char *message)
{
struct mm_struct *mm;
struct task_struct *c;
struct list_head *tsk;
+ printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
+ "children.\n", p->pid, p->comm, points);
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- mm = oom_kill_task(c);
+ mm = oom_kill_task(c, message);
if (mm)
return mm;
}
- return oom_kill_task(p);
+ return oom_kill_task(p, message);
}
/**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
struct mm_struct *mm = NULL;
- task_t * p;
+ task_t *p;
+ unsigned long points = 0; /* constrained kills below report this score without computing one */
if (printk_ratelimit()) {
printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,25 +313,50 @@ void out_of_memory(gfp_t gfp_mask, int order)
cpuset_lock();
read_lock(&tasklist_lock);
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA) that may require different handling.
+ */
+ switch (constrained_alloc(zonelist, gfp_mask)) {
+ case CONSTRAINT_MEMORY_POLICY:
+ mm = oom_kill_process(current, points,
+ "No available memory (MPOL_BIND)");
+ break;
+
+ case CONSTRAINT_CPUSET:
+ mm = oom_kill_process(current, points,
+ "No available memory in cpuset");
+ break;
+
+ case CONSTRAINT_NONE:
retry:
- p = select_bad_process();
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points);
- if (PTR_ERR(p) == -1UL)
- goto out;
+ if (PTR_ERR(p) == -1UL)
+ goto out;
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- cpuset_unlock();
- panic("Out of memory and no killable processes...\n");
- }
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ cpuset_unlock();
+ panic("Out of memory and no killable processes...\n");
+ }
- mm = oom_kill_process(p);
- if (!mm)
- goto retry;
+ mm = oom_kill_process(p, points, "Out of memory");
+ if (!mm)
+ goto retry;
+
+ break;
+ }
- out:
+out:
+ /* Every path arrives here with tasklist_lock still read-held. */
read_unlock(&tasklist_lock);
cpuset_unlock();
if (mm)
mmput(mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62c1225..791690d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1015,7 +1015,7 @@ rebalance:
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
@@ -1541,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES];
*/
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
- int i, n, val;
+ int n, val;
int min_val = INT_MAX;
int best_node = -1;
- for_each_online_node(i) {
- cpumask_t tmp;
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
- /* Start from local node */
- n = (node+i) % num_online_nodes();
+ for_each_online_node(n) {
+ cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
- best_node = node;
- break;
- }
-
/* Use the distance array to find the distance */
val = node_distance(node, n);
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
/* Give preference to headless and unused nodes */
tmp = node_to_cpumask(n);
if (!cpus_empty(tmp))
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b8..7c455fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
}
#ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ char *nodelist = strchr(value, ':'); /* optional ":nodelist" suffix, NULL if absent */
+ int err = 1; /* non-zero = reject; cleared only for an accepted combination */
+
+ if (nodelist) {
+ /* NUL-terminate policy string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, *policy_nodes))
+ goto out; /* nodelist did not parse: err stays 1 */
+ }
+ if (!strcmp(value, "default")) {
+ *policy = MPOL_DEFAULT; /* written even if the value is rejected below */
+ /* Don't allow a nodelist */
+ if (!nodelist)
+ err = 0;
+ } else if (!strcmp(value, "prefer")) {
+ *policy = MPOL_PREFERRED;
+ /* Insist on a nodelist of one node only */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest)) /* skip the leading run of digits */
+ rest++;
+ if (!*rest) /* NOTE(review): an empty list ("prefer:") also lands here if nodelist_parse accepted it - confirm */
+ err = 0;
+ }
+ } else if (!strcmp(value, "bind")) {
+ *policy = MPOL_BIND;
+ /* Insist on a nodelist */
+ if (nodelist)
+ err = 0;
+ } else if (!strcmp(value, "interleave")) {
+ *policy = MPOL_INTERLEAVE;
+ /* Default to nodes online if no nodelist */
+ if (!nodelist)
+ *policy_nodes = node_online_map;
+ err = 0; /* accepted with or without a nodelist */
+ }
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ return err; /* 0 = accepted, 1 = rejected (including an unrecognised policy name) */
+}
+
static struct page *shmem_swapin_async(struct shared_policy *p,
swp_entry_t entry, unsigned long idx)
{
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
return page;
}
#else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ return 1; /* stub for this #else branch (presumably !CONFIG_NUMA - confirm): reject every value */
+}
+
static inline struct page *
shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
{
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
{
char *this_char, *value, *rest;
- while ((this_char = strsep(&options, ",")) != NULL) {
+ while (options != NULL) {
+ this_char = options;
+ for (;;) {
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ options = strchr(options, ',');
+ if (options == NULL)
+ break;
+ options++;
+ if (!isdigit(*options)) {
+ options[-1] = '\0';
+ break;
+ }
+ }
if (!*this_char)
continue;
if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (!strcmp(value,"default"))
- *policy = MPOL_DEFAULT;
- else if (!strcmp(value,"preferred"))
- *policy = MPOL_PREFERRED;
- else if (!strcmp(value,"bind"))
- *policy = MPOL_BIND;
- else if (!strcmp(value,"interleave"))
- *policy = MPOL_INTERLEAVE;
- else
+ if (shmem_parse_mpol(value,policy,policy_nodes))
goto bad_val;
- } else if (!strcmp(this_char,"mpol_nodelist")) {
- nodelist_parse(value, *policy_nodes);
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1838c15..b0af759 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1908,7 +1908,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
cond_resched();
- p->flags |= PF_MEMALLOC;
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -1932,11 +1937,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* a long time.
*/
shrink_slab(sc.nr_scanned, gfp_mask, order);
- sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
}
p->reclaim_state = NULL;
- current->flags &= ~PF_MEMALLOC;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
if (sc.nr_reclaimed == 0)
zone->last_unsuccessful_zone_reclaim = jiffies;