aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/mmzone.h12
-rw-r--r--include/linux/swap.h11
-rw-r--r--include/linux/topology.h8
-rw-r--r--mm/page_alloc.c17
-rw-r--r--mm/vmscan.c68
5 files changed, 108 insertions, 8 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 34cbefd..93a849f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -149,15 +149,17 @@ struct zone {
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
- /*
- * Does the allocator try to reclaim pages from the zone as soon
- * as it fails a watermark_ok() in __alloc_pages?
- */
- int reclaim_pages;
/* A count of how many reclaimers are scanning this zone */
atomic_t reclaim_in_progress;
/*
+ * timestamp (in jiffies) of the last zone reclaim that did not
+ * result in freeing of pages. This is used to avoid repeated scans
+ * if all memory in the zone is in use.
+ */
+ unsigned long last_unsuccessful_zone_reclaim;
+
+ /*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d01f7ef..4a99e4a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
extern int shrink_all_memory(int);
extern int vm_swappiness;
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+#else
+#define zone_reclaim_mode 0
+static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_MIGRATION
extern int isolate_lru_page(struct page *p);
extern int putback_lru_pages(struct list_head *l);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 315a516..e8eb004 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -56,6 +56,14 @@
#define REMOTE_DISTANCE 20
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
+#ifndef RECLAIM_DISTANCE
+/*
+ * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
+ * (in whatever arch specific measurement units returned by node_distance())
+ * then switch on zone reclaim on boot.
+ */
+#define RECLAIM_DISTANCE 20
+#endif
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e2974..df54e2f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
mark = (*z)->pages_high;
if (!zone_watermark_ok(*z, order, mark,
classzone_idx, alloc_flags))
- continue;
+ if (!zone_reclaim_mode ||
+ !zone_reclaim(*z, gfp_mask, order))
+ continue;
}
page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
prev_node = local_node;
nodes_clear(used_mask);
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+
+ /*
+ * If another node is sufficiently far away then it is better
+ * to reclaim pages in a zone before going off node.
+ */
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
+
+ if (distance != node_distance(local_node, prev_node))
node_load[node] += load;
prev_node = node;
load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6..2e34b61 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ int nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = 0,
+ .may_swap = 0,
+ .nr_mapped = read_page_state(nr_mapped),
+ .nr_scanned = 0,
+ .nr_reclaimed = 0,
+ .priority = 0
+ };
+
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->zone_pgdat->node_id != numa_node_id() ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0)
+ return 0;
+
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+ return 0;
+
+ disable_swap_token();
+
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ cond_resched();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+ shrink_zone(zone, &sc);
+ p->reclaim_state = NULL;
+ current->flags &= ~PF_MEMALLOC;
+
+ if (sc.nr_reclaimed == 0)
+ zone->last_unsuccessful_zone_reclaim = jiffies;
+
+ return sc.nr_reclaimed > nr_pages;
+}
+#endif
+