aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c47
-rw-r--r--mm/filemap.c15
-rw-r--r--mm/fremap.c26
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/nommu.c1
-rw-r--r--mm/oom_kill.c107
-rw-r--r--mm/page-writeback.c300
-rw-r--r--mm/page_alloc.c23
-rw-r--r--mm/readahead.c6
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c20
-rw-r--r--mm/slab.c14
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c30
-rw-r--r--mm/swap.c5
-rw-r--r--mm/tiny-shmem.c19
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/vmscan.c40
-rw-r--r--mm/vmstat.c2
19 files changed, 484 insertions, 187 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a281..b0ceb29 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
#include <linux/sched.h>
#include <linux/module.h>
+int bdi_init(struct backing_dev_info *bdi)
+{
+ int i, j;
+ int err;
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+ err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+ if (err)
+ goto err;
+ }
+
+ bdi->dirty_exceeded = 0;
+ err = prop_local_init_percpu(&bdi->completions);
+
+ if (err) {
+err:
+ for (j = 0; j < i; j++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ int i;
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
}
EXPORT_SYMBOL(congestion_wait);
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
- wait_queue_head_t *wqh = &congestion_wqh[rw];
-
- if (waitqueue_active(wqh))
- wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
diff --git a/mm/filemap.c b/mm/filemap.c
index c6049e9..79f24a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
+ * ->zone.lock
*
* ->i_mutex
* ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -1626,12 +1627,18 @@ int __remove_suid(struct dentry *dentry, int kill)
int remove_suid(struct dentry *dentry)
{
- int kill = should_remove_suid(dentry);
+ int killsuid = should_remove_suid(dentry);
+ int killpriv = security_inode_need_killpriv(dentry);
+ int error = 0;
- if (unlikely(kill))
- return __remove_suid(dentry, kill);
+ if (killpriv < 0)
+ return killpriv;
+ if (killpriv)
+ error = security_inode_killpriv(dentry);
+ if (!error && killsuid)
+ error = __remove_suid(dentry, killsuid);
- return 0;
+ return error;
}
EXPORT_SYMBOL(remove_suid);
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb56..14bd3bf 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@
*
* started by Ingo Molnar, Copyright (C) 2002, 2003
*/
-
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
}
-/***
- * sys_remap_file_pages - remap arbitrary pages of a shared backing store
- * file within an existing vma.
+/**
+ * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
* @start: start of the remapped virtual memory range
* @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range
- * @pgoff: to be mapped page of the backing store file
+ * @prot: new protection bits of the range (see NOTE)
+ * @pgoff: to-be-mapped page of the backing store file
* @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
*
- * this syscall works purely via pagetables, so it's the most efficient
+ * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
+ * (shared backing store file).
+ *
+ * This syscall works purely via pagetables, so it's the most efficient
* way to map the same (large) file into a given virtual window. Unlike
* mmap()/mremap() it does not create any new vmas. The new mappings are
* also safe across swapout.
*
- * NOTE: the 'prot' parameter right now is ignored, and the vma's default
- * protection is used. Arbitrary protections might be implemented in the
- * future.
+ * NOTE: the 'prot' parameter right now is ignored (but must be zero),
+ * and the vma's default protection is used. Arbitrary protections
+ * might be implemented in the future.
*/
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
- unsigned long __prot, unsigned long pgoff, unsigned long flags)
+ unsigned long prot, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
int err = -EINVAL;
int has_write_lock = 0;
- if (__prot)
+ if (prot)
return err;
/*
* Sanitize the syscall parameters:
diff --git a/mm/mmap.c b/mm/mmap.c
index 0d40e66..4275e81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
*/
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/mman.h>
@@ -180,8 +181,6 @@ error:
return -ENOMEM;
}
-EXPORT_SYMBOL(__vm_enough_memory);
-
/*
* Requires inode->i_mapping->i_mmap_lock
*/
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb4..42fb84e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int heap_stack_gap = 0;
EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(__vm_enough_memory);
EXPORT_SYMBOL(num_physpages);
/* list of shareable VMAs */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 41b4e36..a64decb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@
#include <linux/notifier.h>
int sysctl_panic_on_oom;
+int sysctl_oom_kill_allocating_task;
+static DEFINE_SPINLOCK(zone_scan_mutex);
/* #define DEBUG */
/**
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* because p may have allocated or otherwise mapped memory on
* this node before. However it will be less likely.
*/
- if (!cpuset_excl_nodes_overlap(p))
+ if (!cpuset_mems_allowed_intersects(current, p))
points /= 8;
/*
@@ -164,16 +166,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
- * Types of limitations to the nodes from which allocations may occur
- */
-#define CONSTRAINT_NONE 1
-#define CONSTRAINT_MEMORY_POLICY 2
-#define CONSTRAINT_CPUSET 3
-
-/*
* Determine the type of allocation constraint.
*/
-static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+ gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
struct zone **z;
@@ -337,12 +333,20 @@ static int oom_kill_task(struct task_struct *p)
return 0;
}
-static int oom_kill_process(struct task_struct *p, unsigned long points,
- const char *message)
+static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ unsigned long points, const char *message)
{
struct task_struct *c;
struct list_head *tsk;
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "%s invoked oom-killer: "
+ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+ current->comm, gfp_mask, order, current->oomkilladj);
+ dump_stack();
+ show_mem();
+ }
+
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
@@ -380,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
+ */
+int try_set_zone_oom(struct zonelist *zonelist)
+{
+ struct zone **z;
+ int ret = 1;
+
+ z = zonelist->zones;
+
+ spin_lock(&zone_scan_mutex);
+ do {
+ if (zone_is_oom_locked(*z)) {
+ ret = 0;
+ goto out;
+ }
+ } while (*(++z) != NULL);
+
+ /*
+ * Lock each zone in the zonelist under zone_scan_mutex so a parallel
+ * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
+ */
+ z = zonelist->zones;
+ do {
+ zone_set_flag(*z, ZONE_OOM_LOCKED);
+ } while (*(++z) != NULL);
+out:
+ spin_unlock(&zone_scan_mutex);
+ return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ */
+void clear_zonelist_oom(struct zonelist *zonelist)
+{
+ struct zone **z;
+
+ z = zonelist->zones;
+
+ spin_lock(&zone_scan_mutex);
+ do {
+ zone_clear_flag(*z, ZONE_OOM_LOCKED);
+ } while (*(++z) != NULL);
+ spin_unlock(&zone_scan_mutex);
+}
+
/**
* out_of_memory - kill the "best" process when we run out of memory
*
@@ -393,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
struct task_struct *p;
unsigned long points = 0;
unsigned long freed = 0;
- int constraint;
+ enum oom_constraint constraint;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
- if (printk_ratelimit()) {
- printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
- current->comm, gfp_mask, order, current->oomkilladj);
- dump_stack();
- show_mem();
- }
-
if (sysctl_panic_on_oom == 2)
panic("out of memory. Compulsory panic_on_oom is selected.\n");
@@ -416,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
* NUMA) that may require different handling.
*/
constraint = constrained_alloc(zonelist, gfp_mask);
- cpuset_lock();
read_lock(&tasklist_lock);
switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, points,
+ oom_kill_process(current, gfp_mask, order, points,
"No available memory (MPOL_BIND)");
break;
- case CONSTRAINT_CPUSET:
- oom_kill_process(current, points,
- "No available memory in cpuset");
- break;
-
case CONSTRAINT_NONE:
if (sysctl_panic_on_oom)
panic("out of memory. panic_on_oom is selected\n");
+ /* Fall-through */
+ case CONSTRAINT_CPUSET:
+ if (sysctl_oom_kill_allocating_task) {
+ oom_kill_process(current, gfp_mask, order, points,
+ "Out of memory (oom_kill_allocating_task)");
+ break;
+ }
retry:
/*
* Rambo mode: Shoot down a process and hope it solves whatever
@@ -446,11 +494,11 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
- cpuset_unlock();
panic("Out of memory and no killable processes...\n");
}
- if (oom_kill_process(p, points, "Out of memory"))
+ if (oom_kill_process(p, points, gfp_mask, order,
+ "Out of memory"))
goto retry;
break;
@@ -458,7 +506,6 @@ retry:
out:
read_unlock(&tasklist_lock);
- cpuset_unlock();
/*
* Give "p" a good chance of killing itself before we
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d821321..7845462 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
* mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Contains functions related to writing back dirty pages at the
* address_space level.
@@ -36,7 +37,7 @@
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_LOCK against an inode for
+ * operation. We do this so we don't hold I_SYNC against an inode for
* enormous amounts of time, which would block a userspace task which has
* been forced to throttle against that inode. Also, the code reevaluates
* the dirty each time it has written this many pages.
@@ -49,8 +50,6 @@
*/
static long ratelimit_pages = 32;
-static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
-
/*
* When balance_dirty_pages decides that the caller needs to perform some
* non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
static void background_writeout(unsigned long _min_pages);
/*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ * period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+ unsigned long dirty_total;
+
+ dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_ratio = vm_dirty_ratio;
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ int shift = calc_period_shift();
+ prop_change_shift(&vm_completions, shift);
+ prop_change_shift(&vm_dirties, shift);
+ }
+ return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ __prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+ prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+ long *numerator, long *denominator)
+{
+ if (bdi_cap_writeback_dirty(bdi)) {
+ prop_fraction_percpu(&vm_completions, &bdi->completions,
+ numerator, denominator);
+ } else {
+ *numerator = 0;
+ *denominator = 1;
+ }
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+ long avail_dirty;
+
+ avail_dirty = dirty -
+ (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_WRITEBACK) +
+ global_page_state(NR_UNSTABLE_NFS));
+
+ if (avail_dirty < 0)
+ avail_dirty = 0;
+
+ avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+ bdi_stat(bdi, BDI_WRITEBACK);
+
+ *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+ long *numerator, long *denominator)
+{
+ prop_fraction_single(&vm_dirties, &tsk->dirties,
+ numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ * dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+ long numerator, denominator;
+ long dirty = *pdirty;
+ u64 inv = dirty >> 3;
+
+ task_dirties_fraction(tsk, &numerator, &denominator);
+ inv *= numerator;
+ do_div(inv, denominator);
+
+ dirty -= inv;
+ if (dirty < *pdirty/2)
+ dirty = *pdirty/2;
+
+ *pdirty = dirty;
+}
+
+/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
*
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void)
}
static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi)
{
int background_ratio; /* Percentages */
int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+
+ if (bdi) {
+ u64 bdi_dirty = dirty;
+ long numerator, denominator;
+
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ *pbdi_dirty = bdi_dirty;
+ clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+ task_dirty_limit(current, pbdi_dirty);
+ }
}
/*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
*/
static void balance_dirty_pages(struct address_space *mapping)
{
- long nr_reclaimable;
+ long bdi_nr_reclaimable;
+ long bdi_nr_writeback;
long background_thresh;
long dirty_thresh;
+ long bdi_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping)
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
- break;
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ break;
- if (!dirty_exceeded)
- dirty_exceeded = 1;
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping)
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- if (nr_reclaimable) {
+ if (bdi_nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
- break;
pages_written += write_chunk - wbc.nr_to_write;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ get_dirty_limits(&background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
+ }
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+ bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else if (bdi_nr_reclaimable) {
+ bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
+
+ if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ break;
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+
congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
- dirty_exceeded = 0;
+ if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+ bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping)
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
+ (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS)
+ > background_thresh)))
pdflush_operation(background_writeout, 0);
}
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long *p;
ratelimit = ratelimit_pages;
- if (dirty_exceeded)
+ if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;
/*
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
long background_thresh;
long dirty_thresh;
- if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
- /*
- * The caller might hold locks which can prevent IO completion
- * or progress in the filesystem. So we cannot just sit here
- * waiting for IO to complete.
- */
- congestion_wait(WRITE, HZ/10);
- return;
- }
-
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
/*
* Boost the allowable dirty threshold a bit for page
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
congestion_wait(WRITE, HZ/10);
+
+ /*
+ * The caller might hold locks which can prevent IO completion
+ * or progress in the filesystem. So we cannot just sit here
+ * waiting for IO to complete.
+ */
+ if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
+ break;
}
}
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages)
long background_thresh;
long dirty_thresh;
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
if (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
break;
+ wbc.more_io = 0;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages)
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
- congestion_wait(WRITE, HZ/10);
- if (!wbc.encountered_congestion)
+ if (wbc.encountered_congestion || wbc.more_io)
+ congestion_wait(WRITE, HZ/10);
+ else
break;
}
}
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg)
global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
+ wbc.more_io = 0;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion)
+ if (wbc.encountered_congestion || wbc.more_io)
congestion_wait(WRITE, HZ/10);
else
break; /* All the old data is written */
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
+ int shift;
+
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
+
+ shift = calc_period_shift();
+ prop_descriptor_init(&vm_completions, shift);
+ prop_descriptor_init(&vm_dirties, shift);
}
/**
@@ -672,8 +850,10 @@ retry:
ret = (*writepage)(page, wbc, data);
- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
+ if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
unlock_page(page);
+ ret = 0;
+ }
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page)
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
task_io_account_write(PAGE_CACHE_SIZE);
}
radix_tree_tag_set(&mapping->page_tree,
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page)
}
return 0;
}
+
+int fastcall set_page_dirty(struct page *page)
+{
+ int ret = __set_page_dirty(page);
+ if (ret)
+ task_dirty_inc(current);
+ return ret;
+}
EXPORT_SYMBOL(set_page_dirty);
/*
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page)
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
return 1;
}
return 0;
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page)
int ret;
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
- if (ret)
+ if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_writeback_dirty(bdi)) {
+ __dec_bdi_stat(bdi, BDI_WRITEBACK);
+ __bdi_writeout_inc(bdi);
+ }
+ }
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page)
int ret;
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
- if (!ret)
+ if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_writeback_dirty(bdi))
+ __inc_bdi_stat(bdi, BDI_WRITEBACK);
+ }
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d315e11..43f757f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
@@ -489,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count,
struct list_head *list, int order)
{
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
+ zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
while (count--) {
struct page *page;
@@ -506,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count,
static void free_one_page(struct zone *zone, struct page *page, int order)
{
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
+ zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
__free_one_page(page, zone, order);
spin_unlock(&zone->lock);
@@ -1586,6 +1587,11 @@ nofail_alloc:
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ if (!try_set_zone_oom(zonelist)) {
+ schedule_timeout_uninterruptible(1);
+ goto restart;
+ }
+
/*
* Go through the zonelist yet one more time, keep
* very high watermark here, this is only to catch
@@ -1594,14 +1600,19 @@ nofail_alloc:
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
- if (page)
+ if (page) {
+ clear_zonelist_oom(zonelist);
goto got_pg;
+ }
/* The OOM killer will not help higher order allocs so fail */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ clear_zonelist_oom(zonelist);
goto nopage;
+ }
out_of_memory(zonelist, gfp_mask, order);
+ clear_zonelist_oom(zonelist);
goto restart;
}
@@ -1850,7 +1861,7 @@ void show_free_areas(void)
K(zone_page_state(zone, NR_INACTIVE)),
K(zone->present_pages),
zone->pages_scanned,
- (zone->all_unreclaimable ? "yes" : "no")
+ (zone_is_all_unreclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3371,7 +3382,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
zone->nr_scan_active = 0;
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
- atomic_set(&zone->reclaim_in_progress, 0);
+ zone->flags = 0;
if (!size)
continue;
diff --git a/mm/readahead.c b/mm/readahead.c
index 2297888..c9c50ca 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -233,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr)
+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
}
+static int __init readahead_init(void)
+{
+ return bdi_init(&default_backing_dev_info);
+}
+subsys_initcall(readahead_init);
+
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index 2b9f413..8990f90 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within inode_lock in __sync_single_inode)
+ * zone->lock (within radix tree node alloc)
*/
#include <linux/mm.h>
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
anon_vma_free(anon_vma);
}
-static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
- unsigned long flags)
+static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
{
struct anon_vma *anon_vma = data;
diff --git a/mm/shmem.c b/mm/shmem.c
index 8a82342..289dbb0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2328,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode)
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
-static void init_once(void *foo, struct kmem_cache *cachep,
- unsigned long flags)
+static void init_once(struct kmem_cache *cachep, void *foo)
{
struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
@@ -2344,9 +2343,7 @@ static int init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, 0, init_once);
- if (shmem_inode_cachep == NULL)
- return -ENOMEM;
+ 0, SLAB_PANIC, init_once);
return 0;
}
@@ -2464,6 +2461,10 @@ static int __init init_tmpfs(void)
{
int error;
+ error = bdi_init(&shmem_backing_dev_info);
+ if (error)
+ goto out4;
+
error = init_inodecache();
if (error)
goto out3;
@@ -2488,6 +2489,8 @@ out1:
out2:
destroy_inodecache();
out3:
+ bdi_destroy(&shmem_backing_dev_info);
+out4:
shm_mnt = ERR_PTR(error);
return error;
}
@@ -2540,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &shmem_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &shmem_file_operations);
return file;
close_file:
diff --git a/mm/slab.c b/mm/slab.c
index e34bcb8..3ce9bc0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
unsigned int batchcount;
unsigned int touched;
spinlock_t lock;
- void *entry[0]; /*
+ void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- * [0] is for gcc 2.95. It should really be [].
*/
};
@@ -408,7 +407,7 @@ struct kmem_cache {
unsigned int dflags; /* dynamic flags */
/* constructor func */
- void (*ctor) (void *, struct kmem_cache *, unsigned long);
+ void (*ctor)(struct kmem_cache *, void *);
/* 5) cache creation/removal */
const char *name;
@@ -2129,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags,
- void (*ctor)(void*, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
@@ -2636,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
* They must also be threaded.
*/
if (cachep->ctor && !(cachep->flags & SLAB_POISON))
- cachep->ctor(objp + obj_offset(cachep), cachep,
- 0);
+ cachep->ctor(cachep, objp + obj_offset(cachep));
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2653,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
cachep->buffer_size / PAGE_SIZE, 0);
#else
if (cachep->ctor)
- cachep->ctor(objp, cachep, 0);
+ cachep->ctor(cachep, objp);
#endif
slab_bufctl(slabp)[i] = i + 1;
}
@@ -3078,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#endif
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
- cachep->ctor(objp, cachep, 0);
+ cachep->ctor(cachep, objp);
#if ARCH_SLAB_MINALIGN
if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de5d556..5bc2ceb 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -499,12 +499,12 @@ struct kmem_cache {
unsigned int size, align;
unsigned long flags;
const char *name;
- void (*ctor)(void *, struct kmem_cache *, unsigned long);
+ void (*ctor)(struct kmem_cache *, void *);
};
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags,
- void (*ctor)(void*, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
struct kmem_cache *c;
@@ -548,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
b = slob_new_page(flags, get_order(c->size), node);
if (c->ctor)
- c->ctor(b, c, 0);
+ c->ctor(c, b);
return b;
}
diff --git a/mm/slub.c b/mm/slub.c
index f426f9b..e29a429 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -980,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
static unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
/*
* The page->offset field is only 16 bit wide. This is an offset
@@ -1027,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
return flags;
}
@@ -1071,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
{
setup_object_debug(s, page, object);
if (unlikely(s->ctor))
- s->ctor(object, s, 0);
+ s->ctor(s, object);
}
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1085,9 +1085,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
BUG_ON(flags & GFP_SLAB_BUG_MASK);
- if (flags & __GFP_WAIT)
- local_irq_enable();
-
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
if (!page)
@@ -1120,8 +1117,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
page->freelist = start;
page->inuse = 0;
out:
- if (flags & __GFP_WAIT)
- local_irq_disable();
return page;
}
@@ -1505,7 +1500,14 @@ new_slab:
goto load_freelist;
}
+ if (gfpflags & __GFP_WAIT)
+ local_irq_enable();
+
new = new_slab(s, gfpflags, node);
+
+ if (gfpflags & __GFP_WAIT)
+ local_irq_disable();
+
if (new) {
c = get_cpu_slab(s, smp_processor_id());
if (c->page) {
@@ -2039,12 +2041,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
init_kmem_cache_node(n);
atomic_long_inc(&n->nr_slabs);
add_partial(n, page);
-
- /*
- * new_slab() disables interupts. If we do not reenable interrupts here
- * then bootup would continue with interrupts disabled.
- */
- local_irq_enable();
return n;
}
@@ -2215,7 +2211,7 @@ static int calculate_sizes(struct kmem_cache *s)
static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
const char *name, size_t size,
size_t align, unsigned long flags,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
memset(s, 0, kmem_size);
s->name = name;
@@ -2805,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s)
static struct kmem_cache *find_mergeable(size_t size,
size_t align, unsigned long flags, const char *name,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
struct kmem_cache *s;
@@ -2846,7 +2842,7 @@ static struct kmem_cache *find_mergeable(size_t size,
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags,
- void (*ctor)(void *, struct kmem_cache *, unsigned long))
+ void (*ctor)(struct kmem_cache *, void *))
{
struct kmem_cache *s;
diff --git a/mm/swap.c b/mm/swap.c
index d034b21..a65eff8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -28,6 +28,7 @@
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/backing-dev.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
@@ -547,6 +548,10 @@ void __init swap_setup(void)
{
unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+#ifdef CONFIG_SWAP
+ bdi_init(swapper_space.backing_dev_info);
+#endif
+
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
page_cluster = 2;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 8803471..d436a9c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!dentry)
goto put_memory;
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
error = -ENOSPC;
inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
if (!inode)
- goto close_file;
+ goto put_dentry;
d_instantiate(dentry, inode);
- inode->i_nlink = 0; /* It is unlinked */
+ error = -ENFILE;
+ file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ &ramfs_file_operations);
+ if (!file)
+ goto put_dentry;
- file->f_path.mnt = mntget(shm_mnt);
- file->f_path.dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &ramfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
+ inode->i_nlink = 0; /* It is unlinked */
/* notify everyone as to the change of file size */
error = do_truncate(dentry, size, 0, file);
diff --git a/mm/truncate.c b/mm/truncate.c
index 5cdfbc1..cadc156 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -8,6 +8,7 @@
*/
#include <linux/kernel.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bbd1946..e147138 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1108,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
- atomic_inc(&zone->reclaim_in_progress);
-
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
@@ -1148,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
}
throttle_vm_writeout(sc->gfp_mask);
-
- atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
}
@@ -1187,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
note_zone_scanning_priority(zone, priority);
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
sc->all_unreclaimable = 0;
@@ -1368,7 +1364,8 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1403,7 +1400,8 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1424,12 +1422,13 @@ loop_again:
lru_pages);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
+ if (zone_is_all_unreclaimable(zone))
continue;
if (nr_slab == 0 && zone->pages_scanned >=
(zone_page_state(zone, NR_ACTIVE)
+ zone_page_state(zone, NR_INACTIVE)) * 6)
- zone->all_unreclaimable = 1;
+ zone_set_flag(zone,
+ ZONE_ALL_UNRECLAIMABLE);
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -1595,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+ if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
continue;
/* For pass = 0 we don't shrink the active list */
@@ -1897,6 +1896,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
int node_id;
+ int ret;
/*
* Zone reclaim reclaims unmapped file backed pages and
@@ -1914,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
<= zone->min_slab_pages)
return 0;
+ if (zone_is_all_unreclaimable(zone))
+ return 0;
+
/*
- * Avoid concurrent zone reclaims, do not reclaim in a zone that does
- * not have reclaimable pages and if we should not delay the allocation
- * then do not scan.
+ * Do not scan if the allocation should not be delayed.
*/
- if (!(gfp_mask & __GFP_WAIT) ||
- zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0 ||
- (current->flags & PF_MEMALLOC))
+ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
return 0;
/*
@@ -1934,6 +1932,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
node_id = zone_to_nid(zone);
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return 0;
- return __zone_reclaim(zone, gfp_mask, order);
+
+ if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+ return 0;
+ ret = __zone_reclaim(zone, gfp_mask, order);
+ zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+ return ret;
}
#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3b5e904..4651bf1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -704,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu",
- zone->all_unreclaimable,
+ zone_is_all_unreclaimable(zone),
zone->prev_priority,
zone->zone_start_pfn);
seq_putc(m, '\n');