diff options
author | JP Abgrall <jpa@google.com> | 2012-04-24 21:45:20 -0700 |
---|---|---|
committer | JP Abgrall <jpa@google.com> | 2012-04-24 21:45:20 -0700 |
commit | 10add970b62d2276541af9e5fb0581d6d8434db4 (patch) | |
tree | afd79e8507c5fa243c044a6963072b6a6d0f059b /mm | |
parent | 9cea9c804af57f8538dc910e24c407c81e496e51 (diff) | |
parent | 66510aa1148e3457e7d46e0a2582dac7c591b95d (diff) | |
download | kernel_samsung_aries-10add970b62d2276541af9e5fb0581d6d8434db4.zip kernel_samsung_aries-10add970b62d2276541af9e5fb0581d6d8434db4.tar.gz kernel_samsung_aries-10add970b62d2276541af9e5fb0581d6d8434db4.tar.bz2 |
Merge remote-tracking branch 'common/android-3.0' into android-samsung-30-wip-mergedown
* common/android-3.0: (1178 commits)
cpufreq: interactive: remove unused target_validate_time_in_idle
cpufreq: interactive: Boost frequency on touchscreen input
cpufreq: Separate speed target revalidate time and initial set time
cpufreq: interactive: based hispeed bump on target freq, not actual
cpufreq: interactive: adjust code and documentation to match
cpufreq: interactive: configurable delay before raising above hispeed
sync: add poll support
sw_sync: add fill_driver_data support
sync: add ioctl to get fence data
sw_sync: add debug support
sync: add debugfs support
sync: add timestamps to sync_pts
sw_sync: add cpu based sync driver
sync: Add synchronization framework
Linux 3.0.28
Bluetooth: Fix l2cap conn failures for ssp devices
TOMOYO: Fix mount flags checking order.
iommu/amd: Make sure IOMMU interrupts are re-enabled on resume
cred: copy_process() should clear child->replacement_session_keyring
ASoC: ak4642: fixup: mute needs +1 step
...
Conflicts:
mm/compaction.c
Change-Id: I3dc59225d2435eddbed0c639155179e580891ac8
Signed-off-by: JP Abgrall <jpa@google.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/bootmem.c | 5 | ||||
-rw-r--r-- | mm/compaction.c | 24 | ||||
-rw-r--r-- | mm/filemap.c | 33 | ||||
-rw-r--r-- | mm/filemap_xip.c | 7 | ||||
-rw-r--r-- | mm/huge_memory.c | 47 | ||||
-rw-r--r-- | mm/hugetlb.c | 5 | ||||
-rw-r--r-- | mm/internal.h | 46 | ||||
-rw-r--r-- | mm/memcontrol.c | 56 | ||||
-rw-r--r-- | mm/memory.c | 18 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 9 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 21 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/percpu-vm.c | 12 | ||||
-rw-r--r-- | mm/percpu.c | 40 | ||||
-rw-r--r-- | mm/slub.c | 9 | ||||
-rw-r--r-- | mm/sparse.c | 30 | ||||
-rw-r--r-- | mm/swap.c | 85 | ||||
-rw-r--r-- | mm/swapfile.c | 4 | ||||
-rw-r--r-- | mm/vmalloc.c | 67 |
23 files changed, 378 insertions, 156 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e56fe35..b3b122f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -686,6 +686,14 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); + /* + * If bdi_unregister() had already been called earlier, the + * wakeup_timer could still be armed because bdi_prune_sb() + * can race with the bdi_wakeup_thread_delayed() calls from + * __mark_inode_dirty(). + */ + del_timer_sync(&bdi->wb.wakeup_timer); + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); diff --git a/mm/bootmem.c b/mm/bootmem.c index 01d5a4b3..9686c4e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -768,14 +768,13 @@ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { bootmem_data_t *bdata; - unsigned long pfn, goal, limit; + unsigned long pfn, goal; pfn = section_nr_to_pfn(section_nr); goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); } #endif diff --git a/mm/compaction.c b/mm/compaction.c index 642e456..1175c24 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -330,17 +330,39 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } else if (!locked) spin_lock_irq(&zone->lru_lock); + /* + * migrate_pfn does not necessarily start aligned to a + * pageblock. Ensure that pfn_valid is called when moving + * into a new MAX_ORDER_NR_PAGES range in case of large + * memory holes within the zone + */ + if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(low_pfn)) { + low_pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + } + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; - /* Get the page and skip if free */ + /* + * Get the page and ensure the page is within the same zone. + * See the comment in isolate_freepages about overlapping + * nodes. It is deliberate that the new zone lock is not taken + * as memory compaction should not move pages between nodes. + */ page = pfn_to_page(low_pfn); /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(low_pfn, page, zone)) continue; + if (page_zone(page) != zone) + continue; + + /* Skip if free */ if (PageBuddy(page)) continue; diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8..b7d8603 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - struct mem_cgroup *memcg = NULL; VM_BUG_ON(!PageLocked(old)); VM_BUG_ON(!PageLocked(new)); VM_BUG_ON(new->mapping); - /* - * This is not page migration, but prepare_migration and - * end_migration does enough work for charge replacement. - * - * In the longer term we probably want a specialized function - * for moving the charge from old to new in a more efficient - * manner. - */ - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); - if (error) - return error; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { struct address_space *mapping = old->mapping; @@ -435,13 +422,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); radix_tree_preload_end(); if (freepage) freepage(old); page_cache_release(old); - mem_cgroup_end_migration(memcg, old, new, true); - } else { - mem_cgroup_end_migration(memcg, old, new, false); } return error; @@ -1393,15 +1379,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; - struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; - blk_start_plug(&plug); - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1417,8 +1400,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { + struct blk_plug plug; + + blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); + blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -1474,7 +1461,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: - blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -1807,7 +1793,7 @@ repeat: page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) @@ -1904,10 +1890,7 @@ static struct page *wait_on_page_read(struct page *page) * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with - * any new page allocations done using the specified allocation flags. Note - * that the Radix tree operations will still use GFP_KERNEL, so you can't - * expect to do this atomically or anything like that - but you can pass in - * other page requirements. + * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 93356cd..dee9429 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -263,7 +263,12 @@ found: xip_pfn); if (err == -ENOMEM) return VM_FAULT_OOM; - BUG_ON(err); + /* + * err == -EBUSY is fine, we've raced against another thread + * that faulted-in the same page + */ + if (err != -EBUSY) + BUG_ON(err); return VM_FAULT_NOPAGE; } else { int err, ret = VM_FAULT_OOM; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f2..8cc11dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -641,6 +641,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm->nr_ptes++; spin_unlock(&mm->page_table_lock); } @@ -759,6 +760,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); prepare_pmd_huge_pte(pgtable, dst_mm); + dst_mm->nr_ptes++; ret = 0; out_unlock: @@ -857,7 +859,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); @@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1016,6 +1017,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); pte_free(tlb->mm, pgtable); @@ -1156,6 +1158,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1164,11 +1167,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1186,10 +1205,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1206,7 +1222,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1223,6 +1238,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); @@ -1295,7 +1312,6 @@ static int __split_huge_page_map(struct page *page, pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1910,7 +1926,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, entry); prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; spin_unlock(&mm->page_table_lock); #ifndef CONFIG_NUMA @@ -2005,7 +2020,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_test_exit(mm)) { /* free mm_slot */ @@ -2033,7 +2048,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153..f9c5849 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -575,6 +575,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -900,7 +901,6 @@ retry: h->resv_huge_pages += delta; ret = 0; - spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) @@ -914,6 +914,7 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } + spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ free: @@ -2415,6 +2416,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { + page_cache_release(new_page); + page_cache_release(old_page); /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; diff --git a/mm/internal.h b/mm/internal.h index d071d38..2189af4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59ac5d6..283068f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3422,6 +3422,50 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, return ret; } +/* + * At replace page cache, newpage is not under any memcg but it's on + * LRU. So, this function doesn't touch res_counter but handles LRU + * in correct way. Both pages are locked so we cannot race with uncharge. + */ +void mem_cgroup_replace_page_cache(struct page *oldpage, + struct page *newpage) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + struct zone *zone; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + /* fix accounting on old pages */ + lock_page_cgroup(pc); + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + ClearPageCgroupUsed(pc); + unlock_page_cgroup(pc); + + if (PageSwapBacked(oldpage)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + zone = page_zone(newpage); + pc = lookup_page_cgroup(newpage); + /* + * Even if newpage->mapping was NULL before starting replacement, + * the newpage may be on LRU(or pagevec for LRU) already. We lock + * LRU while we overwrite pc->mem_cgroup. + */ + spin_lock_irqsave(&zone->lru_lock, flags); + if (PageLRU(newpage)) + del_page_from_lru_list(zone, newpage, page_lru(newpage)); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type); + if (PageLRU(newpage)) + add_page_to_lru_list(zone, newpage, page_lru(newpage)); + spin_unlock_irqrestore(&zone->lru_lock, flags); +} + #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -4514,6 +4558,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, */ BUG_ON(!thresholds); + if (!thresholds->primary) + goto unlock; + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); /* Check if a threshold crossed before removing */ @@ -4562,7 +4609,7 @@ swap_buffers: /* To be sure that nobody uses thresholds */ synchronize_rcu(); - +unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -4963,9 +5010,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = mem; if (mem_cgroup_soft_limit_tree_init()) goto free_out; + root_mem_cgroup = mem; for_each_possible_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); @@ -5004,7 +5051,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); - root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -5244,6 +5290,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) @@ -5405,6 +5453,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { diff --git a/mm/memory.c b/mm/memory.c index d961e19..d49b58a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1228,16 +1228,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { + if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd)) - continue; + goto next; /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -1514,7 +1522,7 @@ split_fallthrough: } if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d2..a85171d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { next = pmd_addr_end(addr, end); split_huge_page_pmd(vma->vm_mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, flags, private)) diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d..117ff54 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -161,7 +161,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else mincore_pte_range(vma, pmd, addr, next, vec); @@ -697,9 +697,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* add the VMA to the tree */ @@ -761,9 +763,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma_prio_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); + mutex_unlock(&mapping->i_mmap_mutex); } /* remove from the MM's tree and list */ @@ -776,8 +780,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) if (vma->vm_next) vma->vm_next->vm_prev = vma->vm_prev; - - vma->vm_mm = NULL; } /* @@ -2061,6 +2063,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; down_write(&nommu_region_sem); + mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, @@ -2068,6 +2071,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return -ETXTBSY; /* not quite true, but near enough */ } @@ -2095,6 +2099,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, } } + mutex_unlock(&inode->i_mapping->i_mmap_mutex); up_write(&nommu_region_sem); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8093fc7..7c72487 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { - int points; + long points; if (oom_unkillable_task(p, mem, nodemask)) return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03d8c48..e2f474d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -370,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } @@ -3411,9 +3411,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) unsigned long block_migratetype; int reserve; - /* Get the start pfn, end pfn and the number of blocks to reserve */ + /* + * Get the start pfn, end pfn and the number of blocks to reserve + * We have to be careful to be aligned to pageblock_nr_pages to + * make sure that we always check pfn_valid for the first page in + * the block. + */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; + start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; @@ -5582,6 +5588,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) bool is_pageblock_removable_nolock(struct page *page) { struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + */ + if (!zone || zone->zone_start_pfn > pfn || + zone->zone_start_pfn + zone->spanned_pages <= pfn) + return false; + return __count_immobile_pages(zone, page, 0); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d5..87eac0e 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea53496..bfad724 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55..0ae7a09 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; -/* cpus with the lowest and highest unit numbers */ -static unsigned int pcpu_first_unit_cpu __read_mostly; -static unsigned int pcpu_last_unit_cpu __read_mostly; +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __read_mostly; +static unsigned int pcpu_high_unit_cpu __read_mostly; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; @@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; - unsigned long first_start, first_end; + unsigned long first_low, first_high; unsigned int cpu; /* - * The following test on first_start/end isn't strictly + * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. */ - first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); - first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, - pcpu_unit_pages); - if ((unsigned long)addr >= first_start && - (unsigned long)addr < first_end) { + first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); + first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); @@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) if (!is_vmalloc_addr(addr)) return __pa(addr); else - return page_to_phys(vmalloc_to_page(addr)); + return page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); } else - return page_to_phys(pcpu_addr_to_page(addr)); + return page_to_phys(pcpu_addr_to_page(addr)) + + offset_in_page(addr); } /** @@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; - pcpu_first_unit_cpu = NR_CPUS; + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; @@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; - if (pcpu_first_unit_cpu == NR_CPUS) - pcpu_first_unit_cpu = cpu; - pcpu_last_unit_cpu = cpu; + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; @@ -1818,6 +1818,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; + /* must check again c->freelist in case of cpu migration or IRQ */ + object = c->freelist; + if (object) + goto update_freelist; + stat(s, ALLOC_REFILL); load_freelist: @@ -1827,6 +1832,7 @@ load_freelist: if (kmem_cache_debug(s)) goto debug; +update_freelist: c->freelist = get_freepointer(s, object); page->inuse = page->objects; page->freelist = NULL; @@ -3433,13 +3439,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); + up_write(&slub_lock); if (sysfs_slab_add(s)) { + down_write(&slub_lock); list_del(&s->list); kfree(n); kfree(s); goto err; } - up_write(&slub_lock); return s; } kfree(n); diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12..4cd05e5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), usemap_count); - if (usemap) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; + if (!usemap) { + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (!usemap) { + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; } - return; } - usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); - if (usemap) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); - } - return; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); } - - printk(KERN_WARNING "%s: allocation failed\n", __func__); } #ifndef CONFIG_SPARSEMEM_VMEMMAP @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) { if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ - struct page *page_head = page->first_page; - smp_rmb(); - /* - * If PageTail is still set after smp_rmb() we can be sure - * that the page->first_page we read wasn't a dangling pointer. - * See __split_huge_page_refcount() smp_wmb(). - */ - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { unsigned long flags; /* - * Verify that our page_head wasn't converted - * to a a regular page before we got a - * reference on it. + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. */ - if (unlikely(!PageHead(page_head))) { - /* PageHead is cleared after PageTail */ - smp_rmb(); - VM_BUG_ON(PageTail(page)); - goto out_put_head; - } - /* - * Only run compound_lock on a valid PageHead, - * after having it pinned with - * get_page_unless_zero() above. - */ - smp_mb(); - /* page_head wasn't a dangling pointer */ flags = compound_lock_irqsave(page_head); if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); VM_BUG_ON(PageHead(page_head)); - out_put_head: if (put_page_testzero(page_head)) __put_single_page(page_head); out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) VM_BUG_ON(page_head != page->first_page); /* * We can release the refcount taken by - * get_page_unless_zero now that - * split_huge_page_refcount is blocked on the - * compound_lock. + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on + * the compound_lock. */ if (put_page_testzero(page_head)) VM_BUG_ON(1); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_dec(&page->_count); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). + */ + unsigned long flags; + bool got = false; + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru @@ -644,7 +667,7 @@ void lru_add_page_tail(struct zone* zone, VM_BUG_ON(!PageHead(page)); VM_BUG_ON(PageCompound(page_tail)); VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); SetPageLRU(page_tail); diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a..c8f4338 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (unlikely(pmd_trans_huge(*pmd))) - continue; - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); if (ret) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 45ece89..43b44db 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1267,18 +1267,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, void *caller) { - struct vm_struct *tmp, **p; - vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->private = vm; va->flags |= VM_VM_AREA; +} + +static void insert_vmalloc_vmlist(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + vm->flags &= ~VM_UNLIST; write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) @@ -1289,6 +1293,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, write_unlock(&vmlist_lock); } +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + setup_vmalloc_vm(vm, va, flags, caller); + insert_vmalloc_vmlist(vm); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) @@ -1327,7 +1338,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - insert_vmalloc_vm(area, va, flags, caller); + /* + * When this function is called from __vmalloc_node_range, + * we do not add vm_struct to vmlist here to avoid + * accessing uninitialized members of vm_struct such as + * pages and nr_pages fields. They will be set later. + * To distinguish it from others, we use a VM_UNLIST flag. + */ + if (flags & VM_UNLIST) + setup_vmalloc_vm(area, va, flags, caller); + else + insert_vmalloc_vm(area, va, flags, caller); + return area; } @@ -1395,17 +1417,20 @@ struct vm_struct *remove_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to this vm_struct - * before unmap. (address range confliction is maintained by - * vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); + + if (!(vm->flags & VM_UNLIST)) { + struct vm_struct *tmp, **p; + /* + * remove from list and disallow access to + * this vm_struct before unmap. (address range + * confliction is maintained by vmap.) + */ + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + } vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); @@ -1616,13 +1641,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, - gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + start, end, node, gfp_mask, caller); if (!area) return NULL; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + if (!addr) + return NULL; + + /* + * In this function, newly allocated vm_struct is not added + * to vmlist at __get_vm_area_node(). so, it is added here. + */ + insert_vmalloc_vmlist(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area |