aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorBenjamin Herrenschmidt <benh@kernel.crashing.org>2009-01-08 16:24:38 +1100
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2009-01-08 16:24:38 +1100
commit24f030175d30f019be41766cdf88c2ff03de19ff (patch)
tree354232a84e82d5a721ed7b1a9af580ff2a59be8f /mm
parent4aa12f7b927c3cac0e0cf3503642597527d0ece0 (diff)
parent9e42d0cf5020aaf217433cad1a224745241d212a (diff)
downloadkernel_goldelico_gta04-24f030175d30f019be41766cdf88c2ff03de19ff.zip
kernel_goldelico_gta04-24f030175d30f019be41766cdf88c2ff03de19ff.tar.gz
kernel_goldelico_gta04-24f030175d30f019be41766cdf88c2ff03de19ff.tar.bz2
Merge commit 'origin/master' into next
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c8
-rw-r--r--mm/bootmem.c8
-rw-r--r--mm/filemap.c45
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/hugetlb.c46
-rw-r--r--mm/internal.h2
-rw-r--r--mm/memcontrol.c3
-rw-r--r--mm/memory.c180
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/migrate.c89
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/mmap.c24
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c109
-rw-r--r--mm/page-writeback.c245
-rw-r--r--mm/page_alloc.c135
-rw-r--r--mm/page_cgroup.c2
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/pdflush.c16
-rw-r--r--mm/rmap.c60
-rw-r--r--mm/shmem.c82
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slub.c22
-rw-r--r--mm/swap.c44
-rw-r--r--mm/swap_state.c31
-rw-r--r--mm/swapfile.c576
-rw-r--r--mm/tiny-shmem.c134
-rw-r--r--mm/vmalloc.c55
-rw-r--r--mm/vmscan.c147
-rw-r--r--mm/vmstat.c4
36 files changed, 1234 insertions, 898 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f..a5b7781 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
example on NUMA systems to put pages nearer to the processors accessing
the page.
-config RESOURCES_64BIT
- bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
- default 64BIT
- help
- This option allows memory and IO resources to be 64 bit.
-
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 51c2770..72255be 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
- readahead.o swap.o truncate.o vmscan.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b..8e85874 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- long background_thresh;
- long dirty_thresh;
- long bdi_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_prop_frac = PROP_FRAC_BASE;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
goto err;
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891..51a0ccf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long fallback = 0;
unsigned long min, max, start, sidx, midx, step;
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
BUG_ON(!size);
BUG_ON(align & (align - 1));
BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
if (!bdata->node_bootmem_map)
return NULL;
- bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
- bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
- align, goal, limit);
-
min = bdata->node_min_pfn;
max = bdata->node_low_pfn;
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..2f55a1e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
+ .nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
@@ -741,7 +741,14 @@ repeat:
page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ /*
+ * We want a regular kernel memory (not highmem or DMA etc)
+ * allocation for the radix tree nodes, but we need to honour
+ * the context-specific requirements the caller has asked for.
+ * GFP_RECLAIM_MASK collects those requirements.
+ */
+ err = add_to_page_cache_lru(page, mapping, index,
+ (gfp_mask & GFP_RECLAIM_MASK));
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
return NULL;
}
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
page_cache_release(page);
page = NULL;
}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
- retval = filemap_write_and_wait(mapping);
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
@@ -1530,7 +1538,6 @@ retry_find:
/*
* Found the page and have a reference on it.
*/
- mark_page_accessed(page);
ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1766,7 +1773,7 @@ int should_remove_suid(struct dentry *dentry)
if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
kill |= ATTR_KILL_SGID;
- if (unlikely(kill && !capable(CAP_FSETID)))
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
return kill;
return 0;
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (count != ocount)
*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
- /*
- * Unmap all mmappings of the file up-front.
- *
- * This will cause any pte dirty bits to be propagated into the
- * pageframes for the subsequent filemap_write_and_wait().
- */
write_len = iov_length(iov, *nr_segs);
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, pos, write_len, 0);
- written = filemap_write_and_wait(mapping);
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
goto out;
@@ -2140,19 +2139,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
* Find or create a page at the given pagecache position. Return the locked
* page. This function is specifically for buffered writes.
*/
-struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
{
int status;
struct page *page;
+ gfp_t gfp_notmask = 0;
+ if (flags & AOP_FLAG_NOFS)
+ gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
if (likely(page))
return page;
- page = page_cache_alloc(mapping);
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
if (!page)
return NULL;
- status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ status = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & ~gfp_notmask);
if (unlikely(status)) {
page_cache_release(page);
if (status == -EEXIST)
@@ -2161,7 +2165,7 @@ repeat:
}
return page;
}
-EXPORT_SYMBOL(__grab_cache_page);
+EXPORT_SYMBOL(grab_cache_page_write_begin);
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
@@ -2286,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
- status = filemap_write_and_wait(mapping);
+ status = filemap_write_and_wait_range(mapping,
+ pos, pos + written - 1);
return written ? written : status;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167df..0c04615 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca7..62d5bbd 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53..618e983 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
}
/*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate;
+
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ hstate = hstate_vma(vma);
+
+ return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
* bits of the reservation map pointer, which are always clear due to
* alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES))
- return clear_gigantic_page(page, addr, sz);
+ if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, sz);
+ return;
+ }
might_sleep();
for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
int i;
struct hstate *h = hstate_vma(vma);
- if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
- return copy_gigantic_page(dst, src, addr, vma);
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src, addr, vma);
+ return;
+ }
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
return page;
}
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
* puts them into the mem_map).
*/
m = addr;
- if (m)
- goto found;
+ goto found;
}
hstate_next_node(h);
nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc..478223b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
+extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7..51ee965 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
return 0;
}
-int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
{
int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
diff --git a/mm/memory.c b/mm/memory.c
index 0a2010a..3f8fa06 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -59,9 +62,6 @@
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-#include <linux/swapops.h>
-#include <linux/elf.h>
-
#include "internal.h"
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
*
* The calling function must still handle the error.
*/
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
- unsigned long vaddr)
-{
- printk(KERN_ERR "Bad pte = %08llx, process = %s, "
- "vm_flags = %lx, vaddr = %lx\n",
- (long long)pte_val(pte),
- (vma->vm_mm == current->mm ? current->comm : "???"),
- vma->vm_flags, vaddr);
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, struct page *page)
+{
+ pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ struct address_space *mapping;
+ pgoff_t index;
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
+
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ return;
+ }
+ if (nr_unshown) {
+ printk(KERN_ALERT
+ "BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+ index = linear_page_index(vma, addr);
+
+ printk(KERN_ALERT
+ "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ if (page) {
+ printk(KERN_ALERT
+ "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+ page, (void *)page->flags, page_count(page),
+ page_mapcount(page), page->mapping, page->index);
+ }
+ printk(KERN_ALERT
+ "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ /*
+ * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+ */
+ if (vma->vm_ops)
+ print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+ (unsigned long)vma->vm_ops->fault);
+ if (vma->vm_file && vma->vm_file->f_op)
+ print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+ (unsigned long)vma->vm_file->f_op->mmap);
dump_stack();
+ add_taint(TAINT_BAD_PAGE);
}
static inline int is_cow_mapping(unsigned int flags)
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
- unsigned long pfn;
+ unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte))) {
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- return pte_page(pte);
- }
- VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+ if (likely(!pte_special(pte)))
+ goto check_pfn;
+ if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/* !HAVE_PTE_SPECIAL case follows: */
- pfn = pte_pfn(pte);
-
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
- VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+ if (unlikely(pfn > highest_memmap_pfn)) {
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+ }
/*
* NOTE! We still have PageReserved() pages in the page tables.
- *
* eg. VDSO mappings can cause them to exist.
*/
out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
else {
if (pte_dirty(ptent))
set_page_dirty(page);
- if (pte_young(ptent))
- SetPageReferenced(page);
+ if (pte_young(ptent) &&
+ likely(!VM_SequentialReadHint(vma)))
+ mark_page_accessed(page);
file_rss--;
}
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
tlb_remove_page(tlb, page);
continue;
}
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
*/
if (unlikely(details))
continue;
- if (!pte_file(ptent))
- free_swap_and_cache(pte_to_swp_entry(ptent));
+ if (pte_file(ptent)) {
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else if
+ (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+ print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int write = !!(flags & GUP_FLAGS_WRITE);
int force = !!(flags & GUP_FLAGS_FORCE);
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+ int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
if (len <= 0)
return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct page *page;
/*
- * If tsk is ooming, cut off its access to large memory
- * allocations. It has a pending SIGKILL, but it can't
- * be processed until returning to user space.
+ * If we have a pending SIGKILL, don't keep faulting
+ * pages and potentially allocating memory, unless
+ * current is handling munlock--e.g., on exit. In
+ * that case, we are not allocating memory. Rather,
+ * we're only unlocking already resident/mapped pages.
*/
- if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
- return i ? i : -ENOMEM;
+ if (unlikely(!ignore_sigkill &&
+ fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
if (write)
foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* do_wp_page has broken COW when necessary,
* even if maybe_mkwrite decided not to set
* pte_write. We can thus safely do subsequent
- * page lookups as if they were reads.
+ * page lookups as if they were reads. But only
+ * do so when looping for pte_write is futile:
+ * in some cases userspace may also be wanting
+ * to write to the gotten user page, which a
+ * read fault here might prevent (a readonly
+ * page might get reCOWed by userspace write).
*/
- if (ret & VM_FAULT_WRITE)
+ if ((ret & VM_FAULT_WRITE) &&
+ !(vma->vm_flags & VM_WRITE))
foll_flags &= ~FOLL_WRITE;
cond_resched();
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
BUG_ON(pmd_huge(*pmd));
+ arch_enter_lazy_mmu_mode();
+
token = pmd_pgtable(*pmd);
do {
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
+
if (mm != &init_mm)
pte_unmap_unlock(pte-1, ptl);
return err;
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* not dirty accountable.
*/
if (PageAnon(old_page)) {
- if (trylock_page(old_page)) {
- reuse = can_share_swap_page(old_page);
- unlock_page(old_page);
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ page_cache_release(old_page);
+ goto unlock;
+ }
+ page_cache_release(old_page);
}
+ reuse = reuse_swap_page(old_page);
+ unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
/*
@@ -1943,11 +2025,7 @@ gotten:
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- SetPageSwapBacked(new_page);
- lru_cache_add_active_or_unevictable(new_page, vma);
page_add_new_anon_rmap(new_page, vma, address);
-
-//TODO: is this safe? do_anonymous_page() does it this way.
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
@@ -1973,7 +2051,7 @@ gotten:
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, vma);
+ page_remove_rmap(old_page);
}
/* Free the old page.. */
@@ -2266,7 +2344,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
}
- if (inode->i_op && inode->i_op->truncate)
+ if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
@@ -2286,7 +2364,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
* a way to truncate a range of blocks (punch a hole) -
* we should return failure right now.
*/
- if (!inode->i_op || !inode->i_op->truncate_range)
+ if (!inode->i_op->truncate_range)
return -ENOSYS;
mutex_lock(&inode->i_mutex);
@@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter(mm, anon_rss);
pte = mk_pte(page, vma->vm_page_prot);
- if (write_access && can_share_swap_page(page)) {
+ if (write_access && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
@@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swap_free(entry);
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- remove_exclusive_swap_page(page);
+ try_to_free_swap(page);
unlock_page(page);
if (write_access) {
@@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
- SetPageSwapBacked(page);
- lru_cache_add_active_or_unevictable(page, vma);
page_add_new_anon_rmap(page, vma, address);
set_pte_at(mm, address, page_table, entry);
@@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter(mm, anon_rss);
- SetPageSwapBacked(page);
- lru_cache_add_active_or_unevictable(page, vma);
page_add_new_anon_rmap(page, vma, address);
} else {
inc_mm_counter(mm, file_rss);
@@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
}
}
-//TODO: is this safe? do_anonymous_page() does it this way.
set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
return 0;
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
- !(vma->vm_flags & VM_CAN_NONLINEAR))) {
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
- print_bad_pte(vma, orig_pte, address);
+ print_bad_pte(vma, address, orig_pte, NULL);
return VM_FAULT_OOM;
}
@@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
{
resource_size_t phys_addr;
unsigned long prot = 0;
- void *maddr;
+ void __iomem *maddr;
int offset = addr & (PAGE_SIZE-1);
if (follow_phys(vma, addr, write, &prot, &phys_addr))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b173711..c083cf5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
return 0;
}
-static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(int nid, struct zone *zone,
+ unsigned long phys_start_pfn)
{
int nr_pages = PAGES_PER_SECTION;
int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
if (ret < 0)
return ret;
- return register_new_memory(__pfn_to_section(phys_start_pfn));
+ return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
{
unsigned long i;
int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
}
static struct page *
-hotremove_migrate_alloc(struct page *page,
- unsigned long private,
- int **x)
+hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
{
- /* This should be improoooooved!! */
- return alloc_page(GFP_HIGHUSER_PAGECACHE);
+ /* This should be improooooved!! */
+ return alloc_page(GFP_HIGHUSER_MOVABLE);
}
-
#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab..5537398 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
* Now we know that no one else is looking at the page.
*/
get_page(newpage); /* add cache reference */
-#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
-#endif
radix_tree_replace_slot(pslot, newpage);
@@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
mlock_migrate_page(newpage, page);
-#ifdef CONFIG_SWAP
ClearPageSwapCache(page);
-#endif
ClearPagePrivate(page);
set_page_private(page, 0);
/* page->mapping contains a flag for PageAnon() */
@@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
struct vm_area_struct *vma;
struct page *page;
- /*
- * A valid page pointer that will not match any of the
- * pages that will be moved.
- */
- pp->page = ZERO_PAGE(0);
-
err = -EFAULT;
vma = find_vma(mm, pp->addr);
if (!vma || !vma_migratable(vma))
@@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
const int __user *nodes,
int __user *status, int flags)
{
- struct page_to_node *pm = NULL;
+ struct page_to_node *pm;
nodemask_t task_nodes;
- int err = 0;
- int i;
+ unsigned long chunk_nr_pages;
+ unsigned long chunk_start;
+ int err;
task_nodes = cpuset_mems_allowed(task);
- /* Limit nr_pages so that the multiplication may not overflow */
- if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
- err = -E2BIG;
- goto out;
- }
-
- pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
- if (!pm) {
- err = -ENOMEM;
+ err = -ENOMEM;
+ pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
+ if (!pm)
goto out;
- }
-
/*
- * Get parameters from user space and initialize the pm
- * array. Return various errors if the user did something wrong.
+ * Store a chunk of page_to_node array in a page,
+ * but keep the last one as a marker
*/
- for (i = 0; i < nr_pages; i++) {
- const void __user *p;
+ chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
- err = -EFAULT;
- if (get_user(p, pages + i))
- goto out_pm;
+ for (chunk_start = 0;
+ chunk_start < nr_pages;
+ chunk_start += chunk_nr_pages) {
+ int j;
- pm[i].addr = (unsigned long)p;
- if (nodes) {
+ if (chunk_start + chunk_nr_pages > nr_pages)
+ chunk_nr_pages = nr_pages - chunk_start;
+
+ /* fill the chunk pm with addrs and nodes from user-space */
+ for (j = 0; j < chunk_nr_pages; j++) {
+ const void __user *p;
int node;
- if (get_user(node, nodes + i))
+ err = -EFAULT;
+ if (get_user(p, pages + j + chunk_start))
+ goto out_pm;
+ pm[j].addr = (unsigned long) p;
+
+ if (get_user(node, nodes + j + chunk_start))
goto out_pm;
err = -ENODEV;
@@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
if (!node_isset(node, task_nodes))
goto out_pm;
- pm[i].node = node;
- } else
- pm[i].node = 0; /* anything to not match MAX_NUMNODES */
- }
- /* End marker */
- pm[nr_pages].node = MAX_NUMNODES;
+ pm[j].node = node;
+ }
+
+ /* End marker for this chunk */
+ pm[chunk_nr_pages].node = MAX_NUMNODES;
+
+ /* Migrate this chunk */
+ err = do_move_page_to_node_array(mm, pm,
+ flags & MPOL_MF_MOVE_ALL);
+ if (err < 0)
+ goto out_pm;
- err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
- if (err >= 0)
/* Return status information */
- for (i = 0; i < nr_pages; i++)
- if (put_user(pm[i].status, status + i))
+ for (j = 0; j < chunk_nr_pages; j++)
+ if (put_user(pm[j].status, status + j + chunk_start)) {
err = -EFAULT;
+ goto out_pm;
+ }
+ }
+ err = 0;
out_pm:
- vfree(pm);
+ free_page((unsigned long)pm);
out:
return err;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56..e125156 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
(atomic_read(&mm->mm_users) != 0));
/*
- * mlock: don't page populate if page has PROT_NONE permission.
- * munlock: the pages always do munlock althrough
- * its has PROT_NONE permission.
+ * mlock: don't page populate if vma has PROT_NONE permission.
+ * munlock: always do munlock although the vma has PROT_NONE
+ * permission, or SIGKILL is pending.
*/
if (!mlock)
- gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+ gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
+ GUP_FLAGS_IGNORE_SIGKILL;
if (vma->vm_flags & VM_WRITE)
gup_flags |= GUP_FLAGS_WRITE;
diff --git a/mm/mmap.c b/mm/mmap.c
index d4855a6..a910c045 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,7 +3,7 @@
*
* Written by obz.
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
#include <linux/slab.h>
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
static void __vma_link_file(struct vm_area_struct *vma)
{
- struct file * file;
+ struct file *file;
file = vma->vm_file;
if (file) {
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
* insert vm structure into list and rbtree and anon_vma,
* but it has already been inserted into prio_tree earlier.
*/
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *__vma, *prev;
+ struct rb_node **rb_link, *rb_parent;
__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
* The caller must hold down_write(current->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node * rb_node;
+ struct rb_node *rb_node;
if (!mm)
goto out;
@@ -1541,7 +1540,7 @@ out:
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm)
arch_exit_mmap(mm);
mmu_notifier_release(mm);
+ if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */
+ return;
+
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) {
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f..d0f6e7c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -4,7 +4,7 @@
* (C) Copyright 1994 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
@@ -22,6 +22,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(mm, addr, pte, ptent);
-#ifdef CONFIG_MIGRATION
- } else if (!pte_file(oldpte)) {
+ } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
-#endif
}
-
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 58a2908..646de95 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,7 +3,7 @@
*
* (C) Copyright 1996 Linus Torvalds
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
diff --git a/mm/msync.c b/mm/msync.c
index 144a757..07dae08 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = do_fsync(file, 0);
+ error = vfs_fsync(file, file->f_path.dentry, 0);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 7695dc8..1c28ea3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -86,7 +86,7 @@ do_expand:
i_size_write(inode, offset);
out_truncate:
- if (inode->i_op && inode->i_op->truncate)
+ if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
out_sig:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9af..6b9e758 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
-static DEFINE_SPINLOCK(zone_scan_mutex);
+static DEFINE_SPINLOCK(zone_scan_lock);
/* #define DEBUG */
/**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
current->comm, gfp_mask, order, current->oomkilladj);
+ task_lock(current);
+ cpuset_print_task_mems_allowed(current);
+ task_unlock(current);
dump_stack();
show_mem();
if (sysctl_oom_dump_tasks)
@@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
struct zone *zone;
int ret = 1;
- spin_lock(&zone_scan_mutex);
+ spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
if (zone_is_oom_locked(zone)) {
ret = 0;
@@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
/*
- * Lock each zone in the zonelist under zone_scan_mutex so a
+ * Lock each zone in the zonelist under zone_scan_lock so a
* parallel invocation of try_set_zone_oom() doesn't succeed
* when it shouldn't.
*/
@@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
}
out:
- spin_unlock(&zone_scan_mutex);
+ spin_unlock(&zone_scan_lock);
return ret;
}
@@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
struct zoneref *z;
struct zone *zone;
- spin_lock(&zone_scan_mutex);
+ spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
zone_clear_flag(zone, ZONE_OOM_LOCKED);
}
- spin_unlock(&zone_scan_mutex);
+ spin_unlock(&zone_scan_lock);
+}
+
+/*
+ * Must be called with tasklist_lock held for read.
+ */
+static void __out_of_memory(gfp_t gfp_mask, int order)
+{
+ if (sysctl_oom_kill_allocating_task) {
+ oom_kill_process(current, gfp_mask, order, 0, NULL,
+ "Out of memory (oom_kill_allocating_task)");
+
+ } else {
+ unsigned long points;
+ struct task_struct *p;
+
+retry:
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points, NULL);
+
+ if (PTR_ERR(p) == -1UL)
+ return;
+
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ panic("Out of memory and no killable processes...\n");
+ }
+
+ if (oom_kill_process(p, gfp_mask, order, points, NULL,
+ "Out of memory"))
+ goto retry;
+ }
+}
+
+/*
+ * pagefault handler calls into here because it is out of memory but
+ * doesn't know exactly how or why.
+ */
+void pagefault_out_of_memory(void)
+{
+ unsigned long freed = 0;
+
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ if (freed > 0)
+ /* Got some memory back in the last second. */
+ return;
+
+ if (sysctl_panic_on_oom)
+ panic("out of memory from page fault. panic_on_oom is selected.\n");
+
+ read_lock(&tasklist_lock);
+ __out_of_memory(0, 0); /* unknown gfp_mask and order */
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Give "p" a good chance of killing itself before we
+ * retry to allocate memory.
+ */
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
}
/**
@@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
*/
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
- struct task_struct *p;
- unsigned long points = 0;
unsigned long freed = 0;
enum oom_constraint constraint;
@@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, points, NULL,
+ oom_kill_process(current, gfp_mask, order, 0, NULL,
"No available memory (MPOL_BIND)");
break;
@@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
panic("out of memory. panic_on_oom is selected\n");
/* Fall-through */
case CONSTRAINT_CPUSET:
- if (sysctl_oom_kill_allocating_task) {
- oom_kill_process(current, gfp_mask, order, points, NULL,
- "Out of memory (oom_kill_allocating_task)");
- break;
- }
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points, NULL);
-
- if (PTR_ERR(p) == -1UL)
- goto out;
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- panic("Out of memory and no killable processes...\n");
- }
-
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
- goto retry;
-
+ __out_of_memory(gfp_mask, order);
break;
}
-out:
read_unlock(&tasklist_lock);
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35..b493db7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
int dirty_background_ratio = 5;
/*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
* free highmem will not be subtracted from the total free memory
* for calculating free ratios if vm_highmem_is_dirtyable is true
*/
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
int vm_dirty_ratio = 10;
/*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
* The interval between `kupdate'-style writebacks, in jiffies
*/
int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
{
unsigned long dirty_total;
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ if (vm_dirty_bytes)
+ dirty_total = vm_dirty_bytes / PAGE_SIZE;
+ else
+ dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ 100;
return 2 + ilog2(dirty_total - 1);
}
/*
- * update the period when the dirty ratio changes.
+ * update the period when the dirty threshold changes.
*/
+static void update_completion_period(void)
+{
+ int shift = calc_period_shift();
+ prop_change_shift(&vm_completions, shift);
+ prop_change_shift(&vm_dirties, shift);
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_bytes = 0;
+ return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_ratio = 0;
+ return ret;
+}
+
int dirty_ratio_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
- int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- int shift = calc_period_shift();
- prop_change_shift(&vm_completions, shift);
- prop_change_shift(&vm_dirties, shift);
+ update_completion_period();
+ vm_dirty_bytes = 0;
+ }
+ return ret;
+}
+
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_bytes = vm_dirty_bytes;
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+ update_completion_period();
+ vm_dirty_ratio = 0;
}
return ret;
}
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
}
void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
- struct backing_dev_info *bdi)
+get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
+ unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
- int background_ratio; /* Percentages */
- int dirty_ratio;
- long background;
- long dirty;
+ unsigned long background;
+ unsigned long dirty;
unsigned long available_memory = determine_dirtyable_memory();
struct task_struct *tsk;
- dirty_ratio = vm_dirty_ratio;
- if (dirty_ratio < 5)
- dirty_ratio = 5;
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else {
+ int dirty_ratio;
- background_ratio = dirty_background_ratio;
- if (background_ratio >= dirty_ratio)
- background_ratio = dirty_ratio / 2;
+ dirty_ratio = vm_dirty_ratio;
+ if (dirty_ratio < 5)
+ dirty_ratio = 5;
+ dirty = (dirty_ratio * available_memory) / 100;
+ }
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
- background = (background_ratio * available_memory) / 100;
- dirty = (dirty_ratio * available_memory) / 100;
+ if (background >= dirty)
+ background = dirty / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
{
long nr_reclaimable, bdi_nr_reclaimable;
long nr_writeback, bdi_nr_writeback;
- long background_thresh;
- long dirty_thresh;
- long bdi_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long write_chunk = sync_writeback_pages();
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
void throttle_vm_writeout(gfp_t gfp_mask)
{
- long background_thresh;
- long dirty_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
for ( ; ; ) {
get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
};
for ( ; ; ) {
- long background_thresh;
- long dirty_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
int done = 0;
struct pagevec pvec;
int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
- int scanned = 0;
+ pgoff_t done_index;
+ int cycled;
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- scanned = 1;
+ cycled = 1; /* ignore range_cyclic tests */
}
retry:
- while (!done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
- unsigned i;
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
/*
- * At this point we hold neither mapping->tree_lock nor
- * lock on the page itself: the page may be truncated or
- * invalidated (changing page->mapping to NULL), or even
- * swizzled back from swapper_space to tmpfs file
- * mapping
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
*/
+ if (page->index > end) {
+ /*
+ * can't be range_cyclic (1st pass) because
+ * end == -1 in that case.
+ */
+ done = 1;
+ break;
+ }
+
+ done_index = page->index + 1;
+
lock_page(page);
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
if (unlikely(page->mapping != mapping)) {
+continue_unlock:
unlock_page(page);
continue;
}
- if (!wbc->range_cyclic && page->index > end) {
- done = 1;
- unlock_page(page);
- continue;
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
}
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
-
- if (PageWriteback(page) ||
- !clear_page_dirty_for_io(page)) {
- unlock_page(page);
- continue;
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
}
- ret = (*writepage)(page, wbc, data);
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
- unlock_page(page);
- ret = 0;
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done = 1;
+ break;
+ }
+ }
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ wbc->nr_to_write--;
+ if (wbc->nr_to_write <= 0) {
+ done = 1;
+ break;
+ }
}
- if (ret || (--nr_to_write <= 0))
- done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
+ break;
}
}
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!cycled) {
/*
+ * range_cyclic:
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
- scanned = 1;
+ cycled = 1;
index = 0;
+ end = writeback_index - 1;
goto retry;
}
if (!wbc->no_nrwrite_index_update) {
if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
- mapping->writeback_index = index;
+ mapping->writeback_index = done_index;
wbc->nr_to_write = nr_to_write;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac014..7bf22e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
-long nr_swap_pages;
+unsigned long highest_memmap_pfn __read_mostly;
int percpu_pagelist_fraction;
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
static void bad_page(struct page *page)
{
- printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
- "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
- current->comm, page, (int)(2*sizeof(unsigned long)),
- (unsigned long)page->flags, page->mapping,
- page_mapcount(page), page_count(page));
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
+
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ goto out;
+ }
+ if (nr_unshown) {
+ printk(KERN_ALERT
+ "BUG: Bad page state: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ printk(KERN_ALERT
+ "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+ page, (void *)page->flags, page_count(page),
+ page_mapcount(page), page->mapping, page->index);
- printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
- KERN_EMERG "Backtrace:\n");
dump_stack();
- page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD;
- set_page_count(page, 0);
- reset_page_mapcount(page);
- page->mapping = NULL;
+out:
+ /* Leave bad fields for debug, except PageBuddy could make trouble */
+ __ClearPageBuddy(page);
add_taint(TAINT_BAD_PAGE);
}
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
}
#endif
-static void destroy_compound_page(struct page *page, unsigned long order)
+static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ int bad = 0;
- if (unlikely(compound_order(page) != order))
+ if (unlikely(compound_order(page) != order) ||
+ unlikely(!PageHead(page))) {
bad_page(page);
+ bad++;
+ }
- if (unlikely(!PageHead(page)))
- bad_page(page);
__ClearPageHead(page);
+
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p) |
- (p->first_page != page)))
+ if (unlikely(!PageTail(p) | (p->first_page != page))) {
bad_page(page);
+ bad++;
+ }
__ClearPageTail(p);
}
+
+ return bad;
}
static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
int migratetype = get_pageblock_migratetype(page);
if (unlikely(PageCompound(page)))
- destroy_compound_page(page, order);
+ if (unlikely(destroy_compound_page(page, order)))
+ return;
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(page_count(page) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
bad_page(page);
- if (PageDirty(page))
- __ClearPageDirty(page);
- if (PageSwapBacked(page))
- __ClearPageSwapBacked(page);
- /*
- * For now, we report if PG_reserved was found set, but do not
- * clear it, and do not free the page. But we shall soon need
- * to do more, for when the ZERO_PAGE count wraps negative.
- */
- return PageReserved(page);
+ return 1;
+ }
+ if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ return 0;
}
/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int i;
- int reserved = 0;
+ int bad = 0;
for (i = 0 ; i < (1 << order) ; ++i)
- reserved += free_pages_check(page + i);
- if (reserved)
+ bad += free_pages_check(page + i);
+ if (bad)
return;
if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(page_count(page) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
bad_page(page);
-
- /*
- * For now, we report if PG_reserved was found set, but do not
- * clear it, and do not allocate the page: as a safety net.
- */
- if (PageReserved(page))
return 1;
+ }
- page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
- 1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
-#ifdef CONFIG_UNEVICTABLE_LRU
- | 1 << PG_mlocked
-#endif
- );
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long pfn;
struct zone *z;
+ if (highest_memmap_pfn < end_pfn - 1)
+ highest_memmap_pfn = end_pfn - 1;
+
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize) {
+ if (usemapsize)
zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
- memset(zone->pageblock_flags, 0, usemapsize);
- }
}
#else
static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
+ if (memmap_pages)
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds realsize %lu\n",
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
* 1TB 101 10GB
* 10TB 320 32GB
*/
-void setup_per_zone_inactive_ratio(void)
+static void setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-struct page *pfn_to_page(unsigned long pfn)
-{
- return __pfn_to_page(pfn);
-}
-unsigned long page_to_pfn(struct page *page)
-{
- return __page_to_pfn(page);
-}
-EXPORT_SYMBOL(pfn_to_page);
-EXPORT_SYMBOL(page_to_pfn);
-#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
unsigned long pfn)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff7..d6507a6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
}
/* __alloc_bootmem...() is protected by !slab_available() */
-int __init_refok init_section_page_cgroup(unsigned long pfn)
+static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
struct mem_section *section;
struct page_cgroup *base, *pc;
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c448..dc6ce0a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
struct bio *bio;
int ret = 0, rw = WRITE;
- if (remove_exclusive_swap_page(page)) {
+ if (try_to_free_swap(page)) {
unlock_page(page);
goto out;
}
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
struct bio *bio;
int ret = 0;
- BUG_ON(!PageLocked(page));
- BUG_ON(PageUptodate(page));
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageUptodate(page));
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
diff --git a/mm/pdflush.c b/mm/pdflush.c
index a0a14c4..15de509 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -172,7 +172,16 @@ static int __pdflush(struct pdflush_work *my_work)
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
- cpumask_t cpus_allowed;
+ cpumask_var_t cpus_allowed;
+
+ /*
+ * Since the caller doesn't even check kthread_run() worked, let's not
+ * freak out too much if this fails.
+ */
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
+ return 0;
+ }
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
@@ -187,8 +196,9 @@ static int pdflush(void *dummy)
* This is needed as pdflush's are dynamically created and destroyed.
* The boottime pdflush's are easily placed w/o these 2 lines.
*/
- cpuset_cpus_allowed(current, &cpus_allowed);
- set_cpus_allowed_ptr(current, &cpus_allowed);
+ cpuset_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, cpus_allowed);
+ free_cpumask_var(cpus_allowed);
return __pdflush(&my_work);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 1099394..ac4af8c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
#include <asm/tlbflush.h>
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+static struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
return NULL;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
goto out_unmap;
}
- if (ptep_clear_flush_young_notify(vma, address, pte))
- referenced++;
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!VM_SequentialReadHint(vma)))
+ referenced++;
+ }
/* Pretend the page is referenced if the task has the
swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ SetPageSwapBacked(page);
+ atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
__page_set_anon_rmap(page, vma, address);
+ if (page_evictable(page, vma))
+ lru_cache_add_lru(page, LRU_ACTIVE_ANON);
+ else
+ add_page_to_unevictable_list(page);
}
/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
*/
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
- BUG_ON(page_mapcount(page) == 0);
if (PageAnon(page))
__page_check_anon_rmap(page, vma, address);
atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
- * @vma: the vm area in which the mapping is removed
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
+void page_remove_rmap(struct page *page)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
- if (unlikely(page_mapcount(page) < 0)) {
- printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
- printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
- printk (KERN_EMERG " page->flags = %lx\n", page->flags);
- printk (KERN_EMERG " page->count = %x\n", page_count(page));
- printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
- print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
- if (vma->vm_ops) {
- print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
- }
- if (vma->vm_file && vma->vm_file->f_op)
- print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
- BUG();
- }
-
/*
* Now that the last pte has gone, s390 must transfer dirty
* flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
-#ifdef CONFIG_MIGRATION
- } else {
+ } else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
BUG_ON(!migration);
entry = make_migration_entry(page, pte_write(pteval));
-#endif
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
- } else
-#ifdef CONFIG_MIGRATION
- if (migration) {
+ } else if (PAGE_MIGRATION && migration) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
-#endif
dec_mm_counter(mm, file_rss);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
page_cache_release(page);
out_unmap:
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (pte_dirty(pteval))
set_page_dirty(page);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index f1b0d48..5941f98 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*
+ * tiny-shmem:
+ * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
+ *
* This file is released under the GPL.
*/
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+static struct vfsmount *shm_mnt;
+
+#ifdef CONFIG_SHMEM
/*
* This virtual memory filesystem is heavily based on the ramfs. It
* extends ramfs by the ability to use swap and honor resource limits
* which makes it a completely usable filesystem.
*/
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/generic_acl.h>
-#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
-#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/vfs.h>
#include <linux/blkdev.h>
@@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
- mark_page_accessed(vmf->page);
return ret | VM_FAULT_LOCKED;
}
@@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = {
.get_sb = shmem_get_sb,
.kill_sb = kill_litter_super,
};
-static struct vfsmount *shm_mnt;
static int __init init_tmpfs(void)
{
@@ -2525,7 +2531,51 @@ out4:
shm_mnt = ERR_PTR(error);
return error;
}
-module_init(init_tmpfs)
+
+#else /* !CONFIG_SHMEM */
+
+/*
+ * tiny-shmem: simple shmemfs and tmpfs using ramfs code
+ *
+ * This is intended for small system where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+#include <linux/ramfs.h>
+
+static struct file_system_type tmpfs_fs_type = {
+ .name = "tmpfs",
+ .get_sb = ramfs_get_sb,
+ .kill_sb = kill_litter_super,
+};
+
+static int __init init_tmpfs(void)
+{
+ BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+
+ shm_mnt = kern_mount(&tmpfs_fs_type);
+ BUG_ON(IS_ERR(shm_mnt));
+
+ return 0;
+}
+
+int shmem_unuse(swp_entry_t entry, struct page *page)
+{
+ return 0;
+}
+
+#define shmem_file_operations ramfs_file_operations
+#define shmem_vm_ops generic_file_vm_ops
+#define shmem_get_inode ramfs_get_inode
+#define shmem_acct_size(a, b) 0
+#define shmem_unacct_size(a, b) do {} while (0)
+#define SHMEM_MAX_BYTES LLONG_MAX
+
+#endif /* CONFIG_SHMEM */
+
+/* common code */
/**
* shmem_file_setup - get an unlinked file living in tmpfs
@@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
if (!inode)
goto close_file;
+#ifdef CONFIG_SHMEM
SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
+#endif
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
- &shmem_file_operations);
+ &shmem_file_operations);
+
+#ifndef CONFIG_MMU
+ error = ramfs_nommu_expand_for_mapping(inode, size);
+ if (error)
+ goto close_file;
+#endif
return file;
close_file:
@@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_ops = &shmem_vm_ops;
return 0;
}
+
+module_init(init_tmpfs)
diff --git a/mm/slab.c b/mm/slab.c
index f97e564..ddc41f3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2157,7 +2157,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/*
* We use cache_chain_mutex to ensure a consistent view of
- * cpu_online_map as well. Please see cpuup_callback
+ * cpu_online_mask as well. Please see cpuup_callback
*/
get_online_cpus();
mutex_lock(&cache_chain_mutex);
diff --git a/mm/slub.c b/mm/slub.c
index 6cb7ad1..6392ae5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1970,7 +1970,7 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu,
kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
-static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
+static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
int cpu, gfp_t flags)
@@ -2045,13 +2045,13 @@ static void init_alloc_cpu_cpu(int cpu)
{
int i;
- if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
+ if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
return;
for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
- cpu_set(cpu, kmem_cach_cpu_free_init_once);
+ cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
}
static void __init init_alloc_cpu(void)
@@ -2254,7 +2254,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
* tracking information or the free pointer be
- * corrupted if an user writes before the start
+ * corrupted if a user writes before the start
* of the object.
*/
size += sizeof(void *);
@@ -3451,7 +3451,7 @@ struct location {
long max_time;
long min_pid;
long max_pid;
- cpumask_t cpus;
+ DECLARE_BITMAP(cpus, NR_CPUS);
nodemask_t nodes;
};
@@ -3526,7 +3526,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (track->pid > l->max_pid)
l->max_pid = track->pid;
- cpu_set(track->cpu, l->cpus);
+ cpumask_set_cpu(track->cpu,
+ to_cpumask(l->cpus));
}
node_set(page_to_nid(virt_to_page(track)), l->nodes);
return 1;
@@ -3556,8 +3557,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->max_time = age;
l->min_pid = track->pid;
l->max_pid = track->pid;
- cpus_clear(l->cpus);
- cpu_set(track->cpu, l->cpus);
+ cpumask_clear(to_cpumask(l->cpus));
+ cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
node_set(page_to_nid(virt_to_page(track)), l->nodes);
return 1;
@@ -3638,11 +3639,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, " pid=%ld",
l->min_pid);
- if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
+ if (num_online_cpus() > 1 &&
+ !cpumask_empty(to_cpumask(l->cpus)) &&
len < PAGE_SIZE - 60) {
len += sprintf(buf + len, " cpus=");
len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
- l->cpus);
+ to_cpumask(l->cpus));
}
if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
diff --git a/mm/swap.c b/mm/swap.c
index b135ec9..ba2c0e8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page)
spin_unlock_irq(&zone->lru_lock);
}
-/**
- * lru_cache_add_active_or_unevictable
- * @page: the page to be added to LRU
- * @vma: vma in which page is mapped for determining reclaimability
- *
- * place @page on active or unevictable LRU list, depending on
- * page_evictable(). Note that if the page is not evictable,
- * it goes directly back onto it's zone's unevictable list. It does
- * NOT use a per cpu pagevec.
- */
-void lru_cache_add_active_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
-{
- if (page_evictable(page, vma))
- lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
- else
- add_page_to_unevictable_list(page);
-}
-
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_release);
/*
- * pagevec_release() for pages which are known to not be on the LRU
- *
- * This function reinitialises the caller's pagevec.
- */
-void __pagevec_release_nonlru(struct pagevec *pvec)
-{
- int i;
- struct pagevec pages_to_free;
-
- pagevec_init(&pages_to_free, pvec->cold);
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
-
- VM_BUG_ON(PageLRU(page));
- if (put_page_testzero(page))
- pagevec_add(&pages_to_free, page);
- }
- pagevec_free(&pages_to_free);
- pagevec_reinit(pvec);
-}
-
-/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
@@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec)
struct page *page = pvec->pages[i];
if (PageSwapCache(page) && trylock_page(page)) {
- if (PageSwapCache(page))
- remove_exclusive_swap_page_ref(page);
+ try_to_free_swap(page);
unlock_page(page);
}
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c90..81c825f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
- BUG_ON(!PageLocked(page));
- BUG_ON(PageSwapCache(page));
- BUG_ON(PagePrivate(page));
- BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapCache(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
error = radix_tree_preload(gfp_mask);
if (!error) {
page_cache_get(page);
@@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
- BUG_ON(!PageLocked(page));
- BUG_ON(!PageSwapCache(page));
- BUG_ON(PageWriteback(page));
- BUG_ON(PagePrivate(page));
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON(PageWriteback(page));
radix_tree_delete(&swapper_space.page_tree, page_private(page));
set_page_private(page, 0);
@@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page * page, gfp_t gfp_mask)
+int add_to_swap(struct page *page)
{
swp_entry_t entry;
int err;
- BUG_ON(!PageLocked(page));
- BUG_ON(!PageUptodate(page));
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageUptodate(page));
for (;;) {
entry = get_swap_page();
@@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
* Add it to the swap cache and mark it dirty
*/
err = add_to_swap_cache(page, entry,
- gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
switch (err) {
case 0: /* Success */
@@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page)
* If we are the only user, then try to free up the swap cache.
*
* Its ok to check for PageSwapCache without the page lock
- * here because we are going to recheck again inside
- * exclusive_swap_page() _with_ the lock.
+ * here because we are going to recheck again inside
+ * try_to_free_swap() _with_ the lock.
* - Marcelo
*/
static inline void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && trylock_page(page)) {
- remove_exclusive_swap_page(page);
+ if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
+ try_to_free_swap(page);
unlock_page(page);
}
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87..eec5ca7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
+#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -35,6 +36,7 @@
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
+long nr_swap_pages;
long total_swap_pages;
static int swap_overflow;
static int least_priority;
@@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
up_read(&swap_unplug_sem);
}
+/*
+ * swapon tell device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static int discard_swap(struct swap_info_struct *si)
+{
+ struct swap_extent *se;
+ int err = 0;
+
+ list_for_each_entry(se, &si->extent_list, list) {
+ sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
+ sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+
+ if (se->start_page == 0) {
+ /* Do not discard the swap header page! */
+ start_block += 1 << (PAGE_SHIFT - 9);
+ nr_blocks -= 1 << (PAGE_SHIFT - 9);
+ if (!nr_blocks)
+ continue;
+ }
+
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL);
+ if (err)
+ break;
+
+ cond_resched();
+ }
+ return err; /* That will often be -EOPNOTSUPP */
+}
+
+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+ pgoff_t start_page, pgoff_t nr_pages)
+{
+ struct swap_extent *se = si->curr_swap_extent;
+ int found_extent = 0;
+
+ while (nr_pages) {
+ struct list_head *lh;
+
+ if (se->start_page <= start_page &&
+ start_page < se->start_page + se->nr_pages) {
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ if (!found_extent++)
+ si->curr_swap_extent = se;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO))
+ break;
+ }
+
+ lh = se->list.next;
+ if (lh == &si->extent_list)
+ lh = lh->next;
+ se = list_entry(lh, struct swap_extent, list);
+ }
+}
+
+static int wait_for_discard(void *word)
+{
+ schedule();
+ return 0;
+}
+
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
{
- unsigned long offset, last_in_cluster;
+ unsigned long offset;
+ unsigned long scan_base;
+ unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ int found_free_cluster = 0;
- /*
+ /*
* We try to cluster swap pages by allocating them sequentially
* in swap. Once we've allocated SWAPFILE_CLUSTER pages this
* way, however, we resort to first-free allocation, starting
@@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
* all over the entire swap partition, so that we reduce
* overall disk seek times between swap pages. -- sct
* But we do now try to find an empty cluster. -Andrea
+ * And we let swap pages go all over an SSD partition. Hugh
*/
si->flags += SWP_SCANNING;
- if (unlikely(!si->cluster_nr)) {
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
- goto lowest;
+ scan_base = offset = si->cluster_next;
+
+ if (unlikely(!si->cluster_nr--)) {
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ goto checks;
+ }
+ if (si->flags & SWP_DISCARDABLE) {
+ /*
+ * Start range check on racing allocations, in case
+ * they overlap the cluster we eventually decide on
+ * (we scan without swap_lock to allow preemption).
+ * It's hardly conceivable that cluster_nr could be
+ * wrapped during our scan, but don't depend on it.
+ */
+ if (si->lowest_alloc)
+ goto checks;
+ si->lowest_alloc = si->max;
+ si->highest_alloc = 0;
+ }
spin_unlock(&swap_lock);
- offset = si->lowest_bit;
+ /*
+ * If seek is expensive, start searching for new cluster from
+ * start of partition, to minimize the span of allocated swap.
+ * But if seek is cheap, search from our current position, so
+ * that swap is allocated from all over the partition: if the
+ * Flash Translation Layer only remaps within limited zones,
+ * we don't want to wear out the first zone too quickly.
+ */
+ if (!(si->flags & SWP_SOLIDSTATE))
+ scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
/* Locate the first empty (unaligned) cluster */
@@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
spin_lock(&swap_lock);
- si->cluster_next = offset-SWAPFILE_CLUSTER+1;
- goto cluster;
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
+ goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
}
}
+
+ offset = si->lowest_bit;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+ /* Locate the first empty (unaligned) cluster */
+ for (; last_in_cluster < scan_base; offset++) {
+ if (si->swap_map[offset])
+ last_in_cluster = offset + SWAPFILE_CLUSTER;
+ else if (offset == last_in_cluster) {
+ spin_lock(&swap_lock);
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+
+ offset = scan_base;
spin_lock(&swap_lock);
- goto lowest;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ si->lowest_alloc = 0;
}
- si->cluster_nr--;
-cluster:
- offset = si->cluster_next;
- if (offset > si->highest_bit)
-lowest: offset = si->lowest_bit;
-checks: if (!(si->flags & SWP_WRITEOK))
+checks:
+ if (!(si->flags & SWP_WRITEOK))
goto no_page;
if (!si->highest_bit)
goto no_page;
- if (!si->swap_map[offset]) {
- if (offset == si->lowest_bit)
- si->lowest_bit++;
- if (offset == si->highest_bit)
- si->highest_bit--;
- si->inuse_pages++;
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
+ if (offset > si->highest_bit)
+ scan_base = offset = si->lowest_bit;
+ if (si->swap_map[offset])
+ goto scan;
+
+ if (offset == si->lowest_bit)
+ si->lowest_bit++;
+ if (offset == si->highest_bit)
+ si->highest_bit--;
+ si->inuse_pages++;
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ }
+ si->swap_map[offset] = 1;
+ si->cluster_next = offset + 1;
+ si->flags -= SWP_SCANNING;
+
+ if (si->lowest_alloc) {
+ /*
+ * Only set when SWP_DISCARDABLE, and there's a scan
+ * for a free cluster in progress or just completed.
+ */
+ if (found_free_cluster) {
+ /*
+ * To optimize wear-levelling, discard the
+ * old data of the cluster, taking care not to
+ * discard any of its pages that have already
+ * been allocated by racing tasks (offset has
+ * already stepped over any at the beginning).
+ */
+ if (offset < si->highest_alloc &&
+ si->lowest_alloc <= last_in_cluster)
+ last_in_cluster = si->lowest_alloc - 1;
+ si->flags |= SWP_DISCARDING;
+ spin_unlock(&swap_lock);
+
+ if (offset < last_in_cluster)
+ discard_swap_cluster(si, offset,
+ last_in_cluster - offset + 1);
+
+ spin_lock(&swap_lock);
+ si->lowest_alloc = 0;
+ si->flags &= ~SWP_DISCARDING;
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+ } else if (si->flags & SWP_DISCARDING) {
+ /*
+ * Delay using pages allocated by racing tasks
+ * until the whole discard has been issued. We
+ * could defer that delay until swap_writepage,
+ * but it's easier to keep this self-contained.
+ */
+ spin_unlock(&swap_lock);
+ wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+ wait_for_discard, TASK_UNINTERRUPTIBLE);
+ spin_lock(&swap_lock);
+ } else {
+ /*
+ * Note pages allocated by racing tasks while
+ * scan for a free cluster is in progress, so
+ * that its final discard can exclude them.
+ */
+ if (offset < si->lowest_alloc)
+ si->lowest_alloc = offset;
+ if (offset > si->highest_alloc)
+ si->highest_alloc = offset;
}
- si->swap_map[offset] = 1;
- si->cluster_next = offset + 1;
- si->flags -= SWP_SCANNING;
- return offset;
}
+ return offset;
+scan:
spin_unlock(&swap_lock);
while (++offset <= si->highest_bit) {
if (!si->swap_map[offset]) {
@@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
latency_ration = LATENCY_LIMIT;
}
}
+ offset = si->lowest_bit;
+ while (++offset < scan_base) {
+ if (!si->swap_map[offset]) {
+ spin_lock(&swap_lock);
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
spin_lock(&swap_lock);
- goto lowest;
no_page:
si->flags -= SWP_SCANNING;
@@ -268,7 +468,7 @@ bad_nofile:
printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
return NULL;
-}
+}
static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
{
@@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page)
}
/*
- * We can use this swap cache entry directly
- * if there are no other references to it.
+ * We can write to an anon page without COW if there are no other references
+ * to it. And as a side-effect, free up its swap: because the old content
+ * on disk will never be read, and seeking back there to write new content
+ * later would only waste time away from clustering.
*/
-int can_share_swap_page(struct page *page)
+int reuse_swap_page(struct page *page)
{
int count;
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageLocked(page));
count = page_mapcount(page);
- if (count <= 1 && PageSwapCache(page))
+ if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
+ if (count == 1 && !PageWriteback(page)) {
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ }
+ }
return count == 1;
}
/*
- * Work out if there are any other processes sharing this
- * swap cache page. Free it if you can. Return success.
+ * If swap is getting full, or if there are no more mappings of this page,
+ * then try_to_free_swap is called to free its swap space.
*/
-static int remove_exclusive_swap_page_count(struct page *page, int count)
+int try_to_free_swap(struct page *page)
{
- int retval;
- struct swap_info_struct * p;
- swp_entry_t entry;
-
- BUG_ON(PagePrivate(page));
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageLocked(page));
if (!PageSwapCache(page))
return 0;
if (PageWriteback(page))
return 0;
- if (page_count(page) != count) /* us + cache + ptes */
- return 0;
-
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (!p)
+ if (page_swapcount(page))
return 0;
- /* Is the only swap cache user the cache itself? */
- retval = 0;
- if (p->swap_map[swp_offset(entry)] == 1) {
- /* Recheck the page count with the swapcache lock held.. */
- spin_lock_irq(&swapper_space.tree_lock);
- if ((page_count(page) == count) && !PageWriteback(page)) {
- __delete_from_swap_cache(page);
- SetPageDirty(page);
- retval = 1;
- }
- spin_unlock_irq(&swapper_space.tree_lock);
- }
- spin_unlock(&swap_lock);
-
- if (retval) {
- swap_free(entry);
- page_cache_release(page);
- }
-
- return retval;
-}
-
-/*
- * Most of the time the page should have two references: one for the
- * process and one for the swap cache.
- */
-int remove_exclusive_swap_page(struct page *page)
-{
- return remove_exclusive_swap_page_count(page, 2);
-}
-
-/*
- * The pageout code holds an extra reference to the page. That raises
- * the reference count to test for to 2 for a page that is only in the
- * swap cache plus 1 for each process that maps the page.
- */
-int remove_exclusive_swap_page_ref(struct page *page)
-{
- return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ return 1;
}
/*
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
*/
-void free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
struct page *page = NULL;
if (is_migration_entry(entry))
- return;
+ return 1;
p = swap_info_get(entry);
if (p) {
@@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry)
spin_unlock(&swap_lock);
}
if (page) {
- int one_user;
-
- BUG_ON(PagePrivate(page));
- one_user = (page_count(page) == 2);
- /* Only cache user (+us), or swap space full? Free it! */
- /* Also recheck PageSwapCache after page is locked (above) */
+ /*
+ * Not mapped elsewhere, or swap space full? Free it!
+ * Also recheck PageSwapCache now page is locked (above).
+ */
if (PageSwapCache(page) && !PageWriteback(page) &&
- (one_user || vm_swap_full())) {
+ (!page_mapped(page) || vm_swap_full())) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
unlock_page(page);
page_cache_release(page);
}
+ return p != NULL;
}
#ifdef CONFIG_HIBERNATION
@@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type)
break;
}
- /*
+ /*
* Get a page for the entry, using the existing swap
* cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
+ * page and read the swap into it.
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
@@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type)
lock_page(page);
wait_on_page_writeback(page);
}
- if (PageSwapCache(page))
+
+ /*
+ * It is conceivable that a racing task removed this page from
+ * swap cache just before we acquired the page lock at the top,
+ * or while we dropped it in unuse_mm(). The page might even
+ * be back in swap cache on another swap area: that we must not
+ * delete, since it may not have been written out to swap yet.
+ */
+ if (PageSwapCache(page) &&
+ likely(page_private(page) == entry.val))
delete_from_swap_cache(page);
/*
@@ -1203,26 +1372,6 @@ out:
return ret;
}
-#if 0 /* We don't need this yet */
-#include <linux/backing-dev.h>
-int page_queue_congested(struct page *page)
-{
- struct backing_dev_info *bdi;
-
- BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
-
- if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
- struct swap_info_struct *sis;
-
- sis = get_swap_info_struct(swp_type(entry));
- bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
- } else
- bdi = page->mapping->backing_dev_info;
- return bdi_write_congested(bdi);
-}
-#endif
-
asmlinkage long sys_swapoff(const char __user * specialfile)
{
struct swap_info_struct * p = NULL;
@@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
char * pathname;
int i, type, prev;
int err;
-
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
spin_lock(&swap_lock);
for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
p = swap_info + type;
- if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
+ if (p->flags & SWP_WRITEOK) {
if (p->swap_file->f_mapping == mapping)
break;
}
@@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v)
file = ptr->swap_file;
len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
- len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
+ len < 40 ? 40 - len : 1, " ",
+ S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
- ptr->pages << (PAGE_SHIFT - 10),
- ptr->inuse_pages << (PAGE_SHIFT - 10),
- ptr->prio);
+ ptr->pages << (PAGE_SHIFT - 10),
+ ptr->inuse_pages << (PAGE_SHIFT - 10),
+ ptr->prio);
return 0;
}
@@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
int i, prev;
int error;
union swap_header *swap_header = NULL;
- int swap_header_version;
unsigned int nr_good_pages = 0;
int nr_extents = 0;
sector_t span;
unsigned long maxpages = 1;
- int swapfilesize;
+ unsigned long swapfilepages;
unsigned short *swap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
- swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+ swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
/*
* Read the swap header.
@@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = PTR_ERR(page);
goto bad_swap;
}
- kmap(page);
- swap_header = page_address(page);
+ swap_header = kmap(page);
- if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
- swap_header_version = 1;
- else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
- swap_header_version = 2;
- else {
+ if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
printk(KERN_ERR "Unable to find swap-space signature\n");
error = -EINVAL;
goto bad_swap;
}
-
- switch (swap_header_version) {
- case 1:
- printk(KERN_ERR "version 0 swap is no longer supported. "
- "Use mkswap -v1 %s\n", name);
+
+ /* swap partition endianess hack... */
+ if (swab32(swap_header->info.version) == 1) {
+ swab32s(&swap_header->info.version);
+ swab32s(&swap_header->info.last_page);
+ swab32s(&swap_header->info.nr_badpages);
+ for (i = 0; i < swap_header->info.nr_badpages; i++)
+ swab32s(&swap_header->info.badpages[i]);
+ }
+ /* Check the swap header's sub-version */
+ if (swap_header->info.version != 1) {
+ printk(KERN_WARNING
+ "Unable to handle swap header version %d\n",
+ swap_header->info.version);
error = -EINVAL;
goto bad_swap;
- case 2:
- /* swap partition endianess hack... */
- if (swab32(swap_header->info.version) == 1) {
- swab32s(&swap_header->info.version);
- swab32s(&swap_header->info.last_page);
- swab32s(&swap_header->info.nr_badpages);
- for (i = 0; i < swap_header->info.nr_badpages; i++)
- swab32s(&swap_header->info.badpages[i]);
- }
- /* Check the swap header's sub-version and the size of
- the swap file and bad block lists */
- if (swap_header->info.version != 1) {
- printk(KERN_WARNING
- "Unable to handle swap header version %d\n",
- swap_header->info.version);
- error = -EINVAL;
- goto bad_swap;
- }
+ }
- p->lowest_bit = 1;
- p->cluster_next = 1;
+ p->lowest_bit = 1;
+ p->cluster_next = 1;
- /*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number of
- * bits for the swap offset in the swp_entry_t type and
- * 2) the number of bits in the a swap pte as defined by
- * the different architectures. In order to find the
- * largest possible bit mask a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
- */
- maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
- if (maxpages > swap_header->info.last_page)
- maxpages = swap_header->info.last_page;
- p->highest_bit = maxpages - 1;
+ /*
+ * Find out how many pages are allowed for a single swap
+ * device. There are two limiting factors: 1) the number of
+ * bits for the swap offset in the swp_entry_t type and
+ * 2) the number of bits in the a swap pte as defined by
+ * the different architectures. In order to find the
+ * largest possible bit mask a swap entry with swap type 0
+ * and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again and finally the swap
+ * offset is extracted. This will mask all the bits from
+ * the initial ~0UL mask that can't be encoded in either
+ * the swp_entry_t or the architecture definition of a
+ * swap pte.
+ */
+ maxpages = swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
+ if (maxpages > swap_header->info.last_page)
+ maxpages = swap_header->info.last_page;
+ p->highest_bit = maxpages - 1;
- error = -EINVAL;
- if (!maxpages)
- goto bad_swap;
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- goto bad_swap;
- }
- if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
- goto bad_swap;
- if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
- goto bad_swap;
+ error = -EINVAL;
+ if (!maxpages)
+ goto bad_swap;
+ if (swapfilepages && maxpages > swapfilepages) {
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
+ goto bad_swap;
+ }
+ if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+ goto bad_swap;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ goto bad_swap;
- /* OK, set up the swap map and apply the bad block list */
- swap_map = vmalloc(maxpages * sizeof(short));
- if (!swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
+ /* OK, set up the swap map and apply the bad block list */
+ swap_map = vmalloc(maxpages * sizeof(short));
+ if (!swap_map) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
- error = 0;
- memset(swap_map, 0, maxpages * sizeof(short));
- for (i = 0; i < swap_header->info.nr_badpages; i++) {
- int page_nr = swap_header->info.badpages[i];
- if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
- error = -EINVAL;
- else
- swap_map[page_nr] = SWAP_MAP_BAD;
- }
- nr_good_pages = swap_header->info.last_page -
- swap_header->info.nr_badpages -
- 1 /* header page */;
- if (error)
+ memset(swap_map, 0, maxpages * sizeof(short));
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ int page_nr = swap_header->info.badpages[i];
+ if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
+ error = -EINVAL;
goto bad_swap;
+ }
+ swap_map[page_nr] = SWAP_MAP_BAD;
}
+ nr_good_pages = swap_header->info.last_page -
+ swap_header->info.nr_badpages -
+ 1 /* header page */;
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
@@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}
+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next = 1 + (random32() % p->highest_bit);
+ }
+ if (discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
else
p->prio = --least_priority;
p->swap_map = swap_map;
- p->flags = SWP_ACTIVE;
+ p->flags |= SWP_WRITEOK;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk\n",
+ "Priority:%d extents:%d across:%lluk %s%s\n",
nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
+ nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
+ (p->flags & SWP_DISCARDABLE) ? "D" : "");
/* insert swap space into swap_list: */
prev = -1;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d57..0000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
- *
- * Matt Mackall <mpm@selenic.com> January, 2004
- * derived from mm/shmem.c and fs/ramfs/inode.c
- *
- * This is intended for small system where the benefits of the full
- * shmem code (swap-backed and resource-limited) are outweighed by
- * their complexity. On systems without swap this code should be
- * effectively equivalent, but much lighter weight.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/vfs.h>
-#include <linux/mount.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/ramfs.h>
-
-static struct file_system_type tmpfs_fs_type = {
- .name = "tmpfs",
- .get_sb = ramfs_get_sb,
- .kill_sb = kill_litter_super,
-};
-
-static struct vfsmount *shm_mnt;
-
-static int __init init_tmpfs(void)
-{
- BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
-
- shm_mnt = kern_mount(&tmpfs_fs_type);
- BUG_ON(IS_ERR(shm_mnt));
-
- return 0;
-}
-module_init(init_tmpfs)
-
-/**
- * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: vm_flags
- */
-struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
-{
- int error;
- struct file *file;
- struct inode *inode;
- struct dentry *dentry, *root;
- struct qstr this;
-
- if (IS_ERR(shm_mnt))
- return (void *)shm_mnt;
-
- error = -ENOMEM;
- this.name = name;
- this.len = strlen(name);
- this.hash = 0; /* will go */
- root = shm_mnt->mnt_root;
- dentry = d_alloc(root, &this);
- if (!dentry)
- goto put_memory;
-
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
- error = -ENOSPC;
- inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
- if (!inode)
- goto close_file;
-
- d_instantiate(dentry, inode);
- inode->i_size = size;
- inode->i_nlink = 0; /* It is unlinked */
- init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
- &ramfs_file_operations);
-
-#ifndef CONFIG_MMU
- error = ramfs_nommu_expand_for_mapping(inode, size);
- if (error)
- goto close_file;
-#endif
- return file;
-
-close_file:
- put_filp(file);
-put_dentry:
- dput(dentry);
-put_memory:
- return ERR_PTR(error);
-}
-EXPORT_SYMBOL_GPL(shmem_file_setup);
-
-/**
- * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
-{
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start;
-
- file = shmem_file_setup("dev/zero", size, vma->vm_flags);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-int shmem_unuse(swp_entry_t entry, struct page *page)
-{
- return 0;
-}
-
-#ifndef CONFIG_MMU
-unsigned long shmem_get_unmapped_area(struct file *file,
- unsigned long addr,
- unsigned long len,
- unsigned long pgoff,
- unsigned long flags)
-{
- return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1ddb77b..c5db9a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,6 +14,7 @@
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -151,11 +152,12 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
*
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
*/
-static int vmap_page_range(unsigned long addr, unsigned long end,
+static int vmap_page_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long addr = start;
int err = 0;
int nr = 0;
@@ -167,7 +169,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- flush_cache_vmap(addr, end);
+ flush_cache_vmap(start, end);
if (unlikely(err))
return err;
@@ -380,8 +382,9 @@ found:
goto retry;
}
if (printk_ratelimit())
- printk(KERN_WARNING "vmap allocation failed: "
- "use vmalloc=<size> to increase size.\n");
+ printk(KERN_WARNING
+ "vmap allocation for size %lu failed: "
+ "use vmalloc=<size> to increase size.\n", size);
return ERR_PTR(-EBUSY);
}
@@ -431,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
vunmap_page_range(va->va_start, va->va_end);
}
+static void vmap_debug_free_range(unsigned long start, unsigned long end)
+{
+ /*
+ * Unmap page tables and force a TLB flush immediately if
+ * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
+ * bugs similarly to those in linear kernel virtual address
+ * space after a page has been freed.
+ *
+ * All the lazy freeing logic is still retained, in order to
+ * minimise intrusiveness of this debugging feature.
+ *
+ * This is going to be *slow* (linear kernel virtual address
+ * debugging doesn't do a broadcast TLB flush so it is a lot
+ * faster).
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+#endif
+}
+
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
@@ -471,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
int sync, int force_flush)
{
- static DEFINE_SPINLOCK(purge_lock);
+ static DEFINE_MUTEX(purge_lock);
LIST_HEAD(valist);
struct vmap_area *va;
int nr = 0;
@@ -482,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
* the case that isn't actually used at the moment anyway.
*/
if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
+ if (!mutex_trylock(&purge_lock))
return;
} else
- spin_lock(&purge_lock);
+ mutex_lock(&purge_lock);
rcu_read_lock();
list_for_each_entry_rcu(va, &vmap_area_list, list) {
@@ -517,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
__free_vmap_area(va);
spin_unlock(&vmap_area_lock);
}
- spin_unlock(&purge_lock);
+ mutex_unlock(&purge_lock);
}
/*
@@ -911,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(addr & (PAGE_SIZE-1));
debug_check_no_locks_freed(mem, size);
+ vmap_debug_free_range(addr, addr+size);
if (likely(count <= VMAP_MAX_ALLOC))
vb_free(mem, size);
@@ -1127,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->private;
struct vm_struct *tmp, **p;
+
+ vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
vm->size -= PAGE_SIZE;
@@ -1374,7 +1401,8 @@ void *vmalloc_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL, -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
area->flags |= VM_USERMAP;
@@ -1419,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+ return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ -1, __builtin_return_address(0));
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1439,7 +1468,8 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL);
+ return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
+ -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -1455,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
+ ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
area->flags |= VM_USERMAP;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62e7f62..b07c48b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
+
/* This context's GFP mask */
gfp_t gfp_mask;
@@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
referenced && page_mapping_inuse(page))
goto activate_locked;
-#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
@@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageAnon(page) && !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- switch (try_to_munlock(page)) {
- case SWAP_FAIL: /* shouldn't happen */
- case SWAP_AGAIN:
- goto keep_locked;
- case SWAP_MLOCK:
- goto cull_mlocked;
- case SWAP_SUCCESS:
- ; /* fall thru'; add to swap cache */
- }
- if (!add_to_swap(page, GFP_ATOMIC))
+ if (!add_to_swap(page))
goto activate_locked;
may_enter_fs = 1;
}
-#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
@@ -752,6 +744,8 @@ free_it:
continue;
cull_mlocked:
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
continue;
@@ -759,7 +753,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && vm_swap_full())
- remove_exclusive_swap_page_ref(page);
+ try_to_free_swap(page);
VM_BUG_ON(PageActive(page));
SetPageActive(page);
pgactivate++;
@@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
zone->prev_priority = priority;
}
-static inline int zone_is_near_oom(struct zone *zone)
-{
- return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
-}
-
/*
* This moves pages from the active list to the inactive list.
*
@@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
list_add(&page->lru, &l_inactive);
}
+ /*
+ * Move the pages to the [file or anon] inactive list.
+ */
+ pagevec_init(&pvec, 1);
+ pgmoved = 0;
+ lru = LRU_BASE + file * LRU_FILE;
+
spin_lock_irq(&zone->lru_lock);
/*
* Count referenced pages from currently used mappings as
@@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* This helps balance scan pressure between file and anonymous
* pages in get_scan_ratio.
*/
- zone->recent_rotated[!!file] += pgmoved;
-
- /*
- * Move the pages to the [file or anon] inactive list.
- */
- pagevec_init(&pvec, 1);
+ if (scan_global_lru(sc))
+ zone->recent_rotated[!!file] += pgmoved;
- pgmoved = 0;
- lru = LRU_BASE + file * LRU_FILE;
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
- anon = zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
- file = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
- free = zone_page_state(zone, NR_FREE_PAGES);
-
/* If we have no swap space, do not bother scanning anon pages. */
if (nr_swap_pages <= 0) {
percent[0] = 0;
@@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
return;
}
+ anon = zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ file = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+ free = zone_page_state(zone, NR_FREE_PAGES);
+
/* If we have very few page cache pages, force-scan anon pages. */
if (unlikely(file + free <= zone->pages_high)) {
percent[0] = 100;
@@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long nr_reclaimed = 0;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long swap_cluster_max = sc->swap_cluster_max;
get_scan_ratio(zone, sc, percent);
@@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
}
zone->lru[l].nr_scan += scan;
nr[l] = zone->lru[l].nr_scan;
- if (nr[l] >= sc->swap_cluster_max)
+ if (nr[l] >= swap_cluster_max)
zone->lru[l].nr_scan = 0;
else
nr[l] = 0;
@@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr[l]) {
- nr_to_scan = min(nr[l],
- (unsigned long)sc->swap_cluster_max);
+ nr_to_scan = min(nr[l], swap_cluster_max);
nr[l] -= nr_to_scan;
nr_reclaimed += shrink_list(l, nr_to_scan,
- zone, sc, priority);
+ zone, sc, priority);
}
}
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is fine for the starting priority;
+ * we want to put equal scanning pressure on each zone.
+ * However, if the VM has a harder time of freeing pages,
+ * with multiple processes reclaiming pages, the total
+ * freeing target can get unreasonably large.
+ */
+ if (nr_reclaimed > swap_cluster_max &&
+ priority < DEF_PRIORITY && !current_is_kswapd())
+ break;
}
+ sc->nr_reclaimed = nr_reclaimed;
+
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
@@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
}
/*
@@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
* b) The zones may be over pages_high but they must go *over* pages_high to
* satisfy the `incremental min' zone defense algorithm.
*
- * Returns the number of reclaimed pages.
- *
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;
@@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
priority);
}
- nr_reclaimed += shrink_zone(priority, zone, sc);
+ shrink_zone(priority, zone, sc);
}
-
- return nr_reclaimed;
}
/*
@@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
int priority;
unsigned long ret = 0;
unsigned long total_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
struct zoneref *z;
@@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (scan_global_lru(sc)) {
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
}
total_scanned += sc->nr_scanned;
- if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+ ret = sc->nr_reclaimed;
goto out;
}
@@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
}
/* top priority shrink_zones still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = nr_reclaimed;
+ ret = sc->nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
int priority;
int i;
unsigned long total_scanned;
- unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
loop_again:
total_scanned = 0;
- nr_reclaimed = 0;
+ sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
@@ -1817,11 +1812,11 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
continue;
@@ -1835,7 +1830,7 @@ loop_again:
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > nr_reclaimed + nr_reclaimed / 2)
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
}
if (all_zones_ok)
@@ -1853,7 +1848,7 @@ loop_again:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
@@ -1872,10 +1867,27 @@ out:
try_to_freeze();
+ /*
+ * Fragmentation may mean that the system cannot be
+ * rebalanced for high-order allocations in all zones.
+ * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
+ * it means the zones have been fully scanned and are still
+ * not balanced. For high-order allocations, there is
+ * little point trying all over again as kswapd may
+ * infinite loop.
+ *
+ * Instead, recheck all watermarks at order-0 as they
+ * are the most important. If watermarks are ok, kswapd will go
+ * back to sleep. High-order users can still perform direct
+ * reclaim if they wish.
+ */
+ if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
+ order = sc.order = 0;
+
goto loop_again;
}
- return nr_reclaimed;
+ return sc.nr_reclaimed;
}
/*
@@ -1902,7 +1914,7 @@ static int kswapd(void *p)
};
node_to_cpumask_ptr(cpumask, pgdat->node_id);
- if (!cpus_empty(*cpumask))
+ if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
@@ -2141,7 +2153,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
pg_data_t *pgdat = NODE_DATA(nid);
node_to_cpumask_ptr(mask, pgdat->node_id);
- if (any_online_cpu(*mask) < nr_cpu_ids)
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
@@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int priority;
- unsigned long nr_reclaimed = 0;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- nr_reclaimed += slab_reclaimable -
+ sc.nr_reclaimed += slab_reclaimable -
zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- return nr_reclaimed >= nr_pages;
+ return sc.nr_reclaimed >= nr_pages;
}
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
* back onto @zone's unevictable list.
*/
#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-void scan_zone_unevictable_pages(struct zone *zone)
+static void scan_zone_unevictable_pages(struct zone *zone)
{
struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
unsigned long scan;
@@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
* that has possibly/probably made some previously unevictable pages
* evictable.
*/
-void scan_all_zones_unevictable_pages(void)
+static void scan_all_zones_unevictable_pages(void)
{
struct zone *zone;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3ccfda..9114974 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -20,7 +20,7 @@
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
{
int cpu;
int i;
@@ -43,7 +43,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
void all_vm_events(unsigned long *ret)
{
get_online_cpus();
- sum_vm_events(ret, &cpu_online_map);
+ sum_vm_events(ret, cpu_online_mask);
put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);