From b83a8e64fd1aecf021111d22c062c97a3241d0c4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:55 +0200 Subject: [PATCH] MM: Remove rogue readahead printk For some reason it triggers always with NFS root and spams the kernel logs of my nfs root boxes a lot. Signed-off-by: Andi Kleen Acked-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d087fc3..b9a60c4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp, return; ra->ra_pages /= 4; - printk(KERN_WARNING "Reducing readahead size to %luK\n", - ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); } /** -- cgit v1.1 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications Few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0f20843..252972f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int __cpuinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; -- cgit v1.1 From b8008b2bc21fb13b45964e21247f18c013d6e985 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:04:04 -0700 Subject: [PATCH] Fix kmem_cache_alloc() been documented twice kmem_cache_alloc() was documented twice, but kmem_cache_zalloc() never. Fix this obvious typo to get things right. Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 252972f..21ba060 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3224,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * kmem_cache_zalloc - Allocate an object. The memory is set to zero. * @cache: The cache to allocate from. * @flags: See kmalloc(). * -- cgit v1.1 From 60c371bc753495f36d3a71338b46030f7fffce3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 5 Aug 2006 12:14:25 -0700 Subject: [PATCH] fadvise() make POSIX_FADV_NOREUSE a no-op The POSIX_FADV_NOREUSE hint means "the application will use this range of the file a single time". It seems to be intended that the implementation will use this hint to perform drop-behind of that part of the file when the application gets around to reading or writing it. However for reasons which aren't obvious (or sane?) I mapped POSIX_FADV_NOREUSE onto POSIX_FADV_WILLNEED. ie: it does readahead. That's daft. So for now, make POSIX_FADV_NOREUSE a no-op. This is a non-back-compatible change. If someone was using POSIX_FADV_NOREUSE to perform readahead, they lose. The likelihood is low. If/when we later implement POSIX_FADV_NOREUSE things will get interesting - to do it fully we'll need to maintain file offset/length ranges and peform all sorts of complex tricks, and managing the lifetime of those ranges' data structures will be interesting.. A sensible implementation would probably ignore the file range and would simply mark the entire file as needing some form of drop-behind treatment. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 60a5d55..168c78a 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) file->f_ra.ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: - case POSIX_FADV_NOREUSE: if (!mapping->a_ops->readpage) { ret = -EINVAL; break; @@ -94,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (ret > 0) ret = 0; break; + case POSIX_FADV_NOREUSE: + break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) filemap_flush(mapping); -- cgit v1.1 From 6f712711dbd180aa3777efe5ae3b9b0e915b9471 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:14:58 -0700 Subject: [PATCH] memory hotadd fixes: not-aligned memory hotadd handling fix ioresouce handling code in memory hotplug allows not-aligned memory hot add. But when memmap and other memory structures are initialized, parameters should be aligned. (if not aligned, initialization of mem_map will do wrong, it assumes parameters are aligned.) This patch fix it. And this patch allows ioresource collision check to handle -EEXIST. Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01c9fb9..7d25cc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -76,15 +76,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, { unsigned long i; int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { - err = __add_section(zone, phys_start_pfn + i); + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(zone, i << PFN_SECTION_SHIFT); - /* We want to keep adding the rest of the - * sections if the first ones already exist + /* + * EEXIST is finally dealed with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. */ if (err && (err != -EEXIST)) break; + err = 0; } return err; @@ -213,10 +220,10 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static void register_memory_resource(u64 start, u64 size) +static int register_memory_resource(u64 start, u64 size) { struct resource *res; - + int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -228,7 +235,9 @@ static void register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); + ret = -EEXIST; } + return ret; } @@ -269,7 +278,7 @@ int add_memory(int nid, u64 start, u64 size) } /* register this memory as resource */ - register_memory_resource(start, size); + ret = register_memory_resource(start, size); return ret; error: -- cgit v1.1 From 58c1b5b079071d82b2f924000b7e8fb5585ce7d8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:01 -0700 Subject: [PATCH] memory hotadd fixes: find_next_system_ram catch range fix find_next_system_ram() is used to find available memory resource at onlining newly added memory. This patch fixes following problem. find_next_system_ram() cannot catch this case. Resource: (start)-------------(end) Section : (start)-------------(end) Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d25cc1..26f1840 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -163,7 +163,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) res.flags = IORESOURCE_MEM; /* we just need system ram */ section_end = res.end; - while (find_next_system_ram(&res) >= 0) { + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); nr_pages = (unsigned long) ((res.end + 1 - res.start) >> PAGE_SHIFT); -- cgit v1.1 From ebd15302dc0ba1b8761600c20854f5371e7bae1e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:06 -0700 Subject: [PATCH] memory hotadd fixes: enhance collision check This patch is for collision check enhancement for memory hot add. It's better to do resouce collision check before doing memory hot add, which will touch memory management structures. And add_section() should check section exists or not before calling sparse_add_one_section(). (sparse_add_one_section() will do another check anyway. but checking in memory_hotplug.c will be easy to understand.) Signed-off-by: KAMEZAWA Hiroyuki Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26f1840..c373195 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) @@ -220,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static int register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; - int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -235,9 +237,18 @@ static int register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); - ret = -EEXIST; + res = NULL; } - return ret; + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; } @@ -246,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; + struct resource *res; int ret; + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) @@ -277,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } - /* register this memory as resource */ - ret = register_memory_resource(start, size); - return ret; error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); + if (res) + release_memory_resource(res); return ret; } -- cgit v1.1 From 1d7ea7324ae7a59f8e17e4ba76a2707c1e6f24d2 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sun, 13 Aug 2006 23:24:27 -0700 Subject: [PATCH] fuse: fix error case in fuse_readpages Don't let fuse_readpages leave the @pages list not empty when exiting on error. [akpm@osdl.org: kernel-doc fixes] Signed-off-by: Alexander Zarochentsev Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/swap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 8fd095c..687686a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the -- cgit v1.1 From b6b5bce3571e496504a89ee575d32101e0a98b93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 27 Aug 2006 01:23:25 -0700 Subject: [PATCH] swsusp: Fix swap_type_of There is a bug in mm/swapfile.c#swap_type_of() that makes swsusp only be able to use the first active swap partition as the resume device. Fix it. Signed-off-by: Rafael J. Wysocki Cc: Hugh Dickins Acked-by: Pavel Machek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e70d6c6..f1f5ec7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -442,11 +442,12 @@ int swap_type_of(dev_t device) if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { spin_unlock(&swap_lock); return i; } - inode = swap_info->swap_file->f_dentry->d_inode; + inode = swap_info[i].swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { spin_unlock(&swap_lock); -- cgit v1.1 From a302eb4e4602d6444ae75a0e516fb2f2c62d6642 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:34 -0700 Subject: [PATCH] ZVC: Overstep counters Increments and decrements are usually grouped rather than mixed. We can optimize the inc and dec functions for that case. Increment and decrement the counters by 50% more than the threshold in those cases and set the differential accordingly. This decreases the need to update the atomic counters. The idea came originally from Andrew Morton. The overstepping alone was sufficient to address the contention issue found when updating the global and the per zone counters from 160 processors. Also remove some code in dec_zone_page_state. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index dfdf241..3799a0f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -190,8 +190,8 @@ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) (*p)++; if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); + *p = -STAT_THRESHOLD / 2; } } @@ -209,8 +209,8 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) (*p)--; if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); + *p = STAT_THRESHOLD /2; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -239,19 +239,9 @@ EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; - struct zone *zone; - s8 *p; - zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)--; - - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __dec_zone_page_state(page, item); local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); -- cgit v1.1 From df9ecaba3f152d1ea79f2a5e0b87505e03f47590 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:35 -0700 Subject: [PATCH] ZVC: Scale thresholds depending on the size of the system The ZVC counter update threshold is currently set to a fixed value of 32. This patch sets up the threshold depending on the number of processors and the sizes of the zones in the system. With the current threshold of 32, I was able to observe slight contention when more than 130-140 processors concurrently updated the counters. The contention vanished when I either increased the threshold to 64 or used Andrew's idea of overstepping the interval (see ZVC overstep patch). However, we saw contention again at 220-230 processors. So we need higher values for larger systems. But the current default is already a bit of an overkill for smaller systems. Some systems have tiny zones where precision matters. For example i386 and x86_64 have 16M DMA zones and either 900M ZONE_NORMAL or ZONE_DMA32. These are even present on SMP and NUMA systems. The patch here sets up a threshold based on the number of processors in the system and the size of the zone that these counters are used for. The threshold should grow logarithmically, so we use fls() as an easy approximation. Results of tests on a system with 1024 processors (4TB RAM) The following output is from a test allocating 1GB of memory concurrently on each processor (Forking the process. So contention on mmap_sem and the pte locks is not a factor): X MIN TYPE: CPUS WALL WALL SYS USER TOTCPU fork 1 0.552 0.552 0.540 0.012 0.552 fork 4 0.552 0.548 2.164 0.036 2.200 fork 16 0.564 0.548 8.812 0.164 8.976 fork 128 0.580 0.572 72.204 1.208 73.412 fork 256 1.300 0.660 310.400 2.160 312.560 fork 512 3.512 0.696 1526.836 4.816 1531.652 fork 1020 20.024 0.700 17243.176 6.688 17249.863 So a threshold of 32 is fine up to 128 processors. At 256 processors contention becomes a factor. Overstepping the counter (earlier patch) improves the numbers a bit: fork 4 0.552 0.548 2.164 0.040 2.204 fork 16 0.552 0.548 8.640 0.148 8.788 fork 128 0.556 0.548 69.676 0.956 70.632 fork 256 0.876 0.636 212.468 2.108 214.576 fork 512 2.276 0.672 997.324 4.260 1001.584 fork 1020 13.564 0.680 11586.436 6.088 11592.523 Still contention at 512 and 1020. Contention at 1020 is down by a third. 256 still has a slight bit of contention. After this patch the counter threshold will be set to 125 which reduces contention significantly: fork 128 0.560 0.548 69.776 0.932 70.708 fork 256 0.636 0.556 143.460 2.036 145.496 fork 512 0.640 0.548 284.244 4.236 288.480 fork 1020 1.500 0.588 1326.152 8.892 1335.044 [akpm@osdl.org: !SMP build fix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 3799a0f..c1b5f41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include #include #include +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -#define STAT_THRESHOLD 32 +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} /* - * Determine pointer to currently valid differential byte given a zone and - * the item number. - * - * Preemption must be off + * Refresh the thresholds for each zone. */ -static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +static void refresh_zone_stat_thresholds(void) { - return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } } /* @@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - s8 *p; + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; long x; - p = diff_pointer(zone, item); x = delta + *p; - if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; } EXPORT_SYMBOL(__mod_zone_page_state); @@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); * No overflow check is necessary and therefore the differential can be * incremented or decremented in place which may allow the compilers to * generate better code. - * * The increment or decrement is known and therefore one boundary check can * be omitted. * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * * Some processors have inc/dec instructions that are atomic vs an interrupt. * However, the code must first determine the differential location in a zone * based on the processor number and then inc/dec the counter. There is no @@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); */ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)++; - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); - *p = -STAT_THRESHOLD / 2; + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; } } @@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { struct zone *zone = page_zone(page); - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)--; - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); - *p = STAT_THRESHOLD /2; + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif } seq_printf(m, "\n all_unreclaimable: %u" @@ -603,3 +670,35 @@ struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. + */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; + +int __init setup_vmstat(void) +{ + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + return 0; +} +module_init(setup_vmstat) +#endif -- cgit v1.1 From 0b1d647a02c5a1b67d45287eeb6cb3b2219c41c3 Mon Sep 17 00:00:00 2001 From: Pavel Mironchik Date: Thu, 31 Aug 2006 21:27:47 -0700 Subject: [PATCH] dm: work around mempool_alloc, bio_alloc_bioset deadlocks This patch works around a complex dm-related deadlock/livelock down in the mempool allocator. Alasdair said: Several dm targets suffer from this. Mempools are not yet used correctly everywhere in device-mapper: they can get shared when devices are stacked, and some targets share them across multiple instances. I made fixing this one of the prerequisites for this patch: md-dm-reduce-stack-usage-with-stacked-block-devices.patch which in some cases makes people more likely to hit the problem. There's been some progress on this recently with (unfinished) dm-crypt patches at: http://www.kernel.org/pub/linux/kernel/people/agk/patches/2.6/editing/ (dm-crypt-move-io-to-workqueue.patch plus dependencies) and: I've no problems with a temporary workaround like that, but Milan Broz (a new Redhat developer in the Czech Republic) has started reviewing all the mempool usage in device-mapper so I'm expecting we'll soon have a proper fix for this associated problems. [He's back from holiday at the start of next week.] For now, this sad-but-safe little patch will allow the machine to recover. [akpm@osdl.org: rewrote changelog] Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index fe6e052..ccd8cb8 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -238,8 +238,13 @@ repeat_alloc: init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); smp_mb(); - if (!pool->curr_nr) - io_schedule(); + if (!pool->curr_nr) { + /* + * FIXME: this should be io_schedule(). The timeout is there + * as a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + } finish_wait(&pool->wait, &wait); goto repeat_alloc; -- cgit v1.1 From 3b98b087fc2daab67518d2baa8aef19a6ad82723 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 31 Aug 2006 21:27:53 -0700 Subject: [PATCH] fix NUMA interleaving for huge pages Since vma->vm_pgoff is in units of smallpages, VMAs for huge pages have the lower HPAGE_SHIFT - PAGE_SHIFT bits always cleared, which results in badd offsets to the interleave functions. Take this difference from small pages into account when calculating the offset. This does add a 0-bit shift into the small-page path (via alloc_page_vma()), but I think that is negligible. Also add a BUG_ON to prevent the offset from growing due to a negative right-shift, which probably shouldn't be allowed anyways. Tested on an 8-memory node ppc64 NUMA box and got the interleaving I expected. Signed-off-by: Nishanth Aravamudan Signed-off-by: Adam Litke Cc: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e07e27e..a9963ce 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else -- cgit v1.1