diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 68 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 42 | ||||
-rw-r--r-- | mm/highmem.c | 62 | ||||
-rw-r--r-- | mm/hugetlb.c | 5 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 1 | ||||
-rw-r--r-- | mm/memory.c | 32 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 48 | ||||
-rw-r--r-- | mm/mempolicy.c | 15 | ||||
-rw-r--r-- | mm/migrate.c | 17 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 49 | ||||
-rw-r--r-- | mm/oom_kill.c | 33 | ||||
-rw-r--r-- | mm/page-writeback.c | 31 | ||||
-rw-r--r-- | mm/page_alloc.c | 99 | ||||
-rw-r--r-- | mm/page_isolation.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 8 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 49 | ||||
-rw-r--r-- | mm/vmalloc.c | 56 | ||||
-rw-r--r-- | mm/vmscan.c | 218 | ||||
-rw-r--r-- | mm/vmstat.c | 44 |
23 files changed, 616 insertions, 274 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d4204..f2eb278 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df0637..4df2de7 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44..75572b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return ret | VM_FAULT_RETRY; + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; diff --git a/mm/highmem.c b/mm/highmem.c index 7a0aa1b..781e754 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -42,6 +42,10 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +DEFINE_PER_CPU(int, __kmap_atomic_idx); +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -422,61 +426,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#ifdef CONFIG_KGDB_KDB - if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { - WARN_ON(1); - warn_count--; - } -#endif /* CONFIG_KGDB_KDB */ -} - -#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96991de..c4a3558 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2448,8 +2448,11 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; + } copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); diff --git a/mm/internal.h b/mm/internal.h index 6a697bb..dedb0af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44a8cef..1243241 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1292,6 +1292,7 @@ static int soft_offline_huge_page(struct page *page, int flags) list_add(&hpage->lru, &pagelist); ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/memory.c b/mm/memory.c index af82741..02e48aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -736,7 +736,7 @@ again: dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; @@ -767,7 +767,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(orig_src_pte); + pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -1591,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); @@ -2080,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else @@ -2108,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; @@ -2627,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; int exclusive = 0; int ret = 0; @@ -2677,8 +2679,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not @@ -2927,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; if (unlikely(PageHWPoison(vmf.page))) { @@ -3344,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3381,6 +3388,17 @@ out: return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d4e940a..9260314 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ @@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -int scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) page_is_file_cache(page)); } else { - /* Becasue we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) - not_managed++; #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); dump_page(page); #endif + /* Becasue we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + ret = -EBUSY; + break; + } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); - out: return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5..81a1276 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len, err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, 0); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index f8c9bcc..fe5a3c6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; @@ -884,8 +883,9 @@ out: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list. * * Return: Number of pages not migrated or error code. */ @@ -932,8 +932,6 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); - if (rc) return rc; @@ -1039,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); @@ -1088,9 +1086,12 @@ set_status: } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; @@ -1203,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); diff --git a/mm/mremap.c b/mm/mremap.c index cde56ee..563fbdd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -293,11 +293,58 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } -EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4029583..7dcca55 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, return 0; /* - * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't - * need to be executed for something that cannot be killed. + * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } @@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, #define K(x) ((x) << (PAGE_SHIFT-10)) static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { + struct task_struct *q; + struct mm_struct *mm; + p = find_lock_task_mm(p); if (!p) return 1; + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), K(get_mm_counter(p->mm, MM_ANONPAGES)), K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); + /* + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. + */ + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); + force_sig(SIGKILL, q); + } set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); @@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - (current->signal->oom_adj != OOM_DISABLE)) { + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e3bccac..b840afa 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping, * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; @@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping, * the last resort safeguard. */ dirty_exceeded = - (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) - || (nr_reclaimable + nr_writeback >= dirty_thresh); + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; @@ -1121,6 +1115,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); @@ -1129,6 +1124,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. + */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); +} +EXPORT_SYMBOL(account_page_writeback); + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * @@ -1366,7 +1373,7 @@ int test_set_page_writeback(struct page *page) ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a362c5..07a6544 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -531,7 +531,7 @@ static inline void __free_one_page(struct page *page, * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; combined_idx = __find_combined_index(page_idx, order); higher_page = page + combined_idx - page_idx; @@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1932,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1940,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; @@ -2095,7 +2095,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -5297,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. */ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - struct page *curr_page; - unsigned long flags, pfn, iter; - unsigned long immobile = 0; + unsigned long flags, pfn; struct memory_isolate_notify arg; int notifier_ret; int ret = -EBUSY; @@ -5312,11 +5365,6 @@ int set_migratetype_isolate(struct page *page) zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || - zone_idx == ZONE_MOVABLE) { - ret = 0; - goto out; - } pfn = page_to_pfn(page); arg.start_pfn = pfn; @@ -5336,23 +5384,20 @@ int set_migratetype_isolate(struct page *page) */ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret || !arg.pages_found) + if (notifier_ret) goto out; - - for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { - if (!pfn_valid_within(pfn)) - continue; - - curr_page = pfn_to_page(iter); - if (!page_count(curr_page) || PageLRU(curr_page)) - continue; - - immobile++; - } - - if (arg.pages_found == immobile) + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) ret = 0; + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + out: if (!ret) { set_pageblock_migratetype(page, MIGRATE_ISOLATE); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd9..4ae42bb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. @@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void) return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); } -void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } @@ -314,7 +314,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; @@ -348,6 +348,8 @@ out: } void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { anon_vma_unlock(anon_vma); rcu_read_unlock(); @@ -407,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; @@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fc7bac..67ddaaf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> +#include <linux/poll.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1688,6 +1695,25 @@ out: } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { @@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9f90962..a3d66b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); @@ -1596,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1607,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); /** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * @@ -1653,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -2350,6 +2392,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -2376,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index b94c946..b8a6fdc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,6 +51,12 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -82,7 +88,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - bool lumpy_reclaim_mode; + enum lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -330,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -366,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -376,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; @@ -394,7 +428,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -402,7 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sync_writeback)); + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -580,7 +615,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) return PAGEREF_RECLAIM; /* @@ -616,7 +651,7 @@ static enum page_references page_check_references(struct page *page, } /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page) + if (referenced_page && !PageSwapBacked(page)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; @@ -644,12 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct zone *zone, + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); @@ -669,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -694,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -743,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -751,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -841,6 +888,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -853,10 +901,21 @@ activate_locked: keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + free_page_list(&free_pages); list_splice(&ret_pages, page_list); @@ -1006,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; /* * If we don't have enough swap space, reclaiming of @@ -1014,8 +1073,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * pointless. */ if (nr_swap_pages <= 0 && PageAnon(cursor_page) && - !PageSwapCache(cursor_page)) - continue; + !PageSwapCache(cursor_page)) + break; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); @@ -1026,11 +1085,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, nr_lumpy_dirty++; scan++; } else { - if (mode == ISOLATE_BOTH && - page_count(cursor_page)) - nr_lumpy_failed++; + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; @@ -1253,7 +1317,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (!sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1286,7 +1350,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_active; unsigned long nr_anon; unsigned long nr_file; @@ -1298,15 +1361,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - + set_lumpy_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1318,8 +1381,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); /* @@ -1337,20 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list, NULL); - count_vm_events(PGDEACTIVATE, nr_active); - - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1359,6 +1414,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); return nr_reclaimed; } @@ -1506,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1531,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1721,21 +1797,6 @@ out: } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; -} - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1750,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { @@ -1782,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1937,21 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; + + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); + } } out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; - delayacct_freepages_end(); put_mems_allowed(); @@ -2247,6 +2301,15 @@ loop_again: if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); } } @@ -2987,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. @@ -3033,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e6..cd2e42b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -17,6 +17,8 @@ #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> +#include <linux/writeback.h> +#include <linux/compaction.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -394,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_COMPACTION + struct contig_page_info { unsigned long free_pages; unsigned long free_blocks_total; @@ -745,6 +748,11 @@ static const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "nr_shmem", + "nr_dirtied", + "nr_written", + "nr_dirty_threshold", + "nr_dirty_background_threshold", + #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -904,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) |