From 1fa44ecad2b86475e038aed81b0bf333fa484f8b Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 23 Feb 2006 12:43:43 -0600 Subject: [SCSI] add execute_in_process_context() API We have several points in the SCSI stack (primarily for our device functions) where we need to guarantee process context, but (given the place where the last reference was released) we cannot guarantee this. This API gets around the issue by executing the function directly if the caller has process context, but scheduling a workqueue to execute in process context if the caller doesn't have it. Signed-off-by: James Bottomley --- include/linux/workqueue.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 86b1113..957c21c 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -20,6 +20,10 @@ struct work_struct { struct timer_list timer; }; +struct execute_work { + struct work_struct work; +}; + #define __WORK_INITIALIZER(n, f, d) { \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ @@ -74,6 +78,8 @@ extern void init_workqueues(void); void cancel_rearming_delayed_work(struct work_struct *work); void cancel_rearming_delayed_workqueue(struct workqueue_struct *, struct work_struct *); +int execute_in_process_context(void (*fn)(void *), void *, + struct execute_work *); /* * Kill off a pending schedule_delayed_work(). Note that the work callback -- cgit v1.1 From 044cc6c8ec311c4ddeebfcc31c53dea282de70b7 Mon Sep 17 00:00:00 2001 From: "andrew.vasquez@qlogic.com" Date: Thu, 9 Mar 2006 14:27:13 -0800 Subject: [SCSI] qla2xxx: Add ISP54xx support. Chip is similar in form to our ISP24xx offering. Signed-off-by: Andrew Vasquez Signed-off-by: James Bottomley --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 82b83da..1afac93 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -852,6 +852,8 @@ #define PCI_DEVICE_ID_QLOGIC_ISP2432 0x2432 #define PCI_DEVICE_ID_QLOGIC_ISP2512 0x2512 #define PCI_DEVICE_ID_QLOGIC_ISP2522 0x2522 +#define PCI_DEVICE_ID_QLOGIC_ISP5422 0x5422 +#define PCI_DEVICE_ID_QLOGIC_ISP5432 0x5432 #define PCI_VENDOR_ID_CYRIX 0x1078 #define PCI_DEVICE_ID_CYRIX_5510 0x0000 -- cgit v1.1 From e935d5da8e5d12fabe5b632736c50eae0427e8c8 Mon Sep 17 00:00:00 2001 From: "Moore, Eric" Date: Tue, 14 Mar 2006 09:18:18 -0700 Subject: [SCSI] drivers/base/bus.c - export reprobe Adding support for exposing hidden raid components for the sg interface. The sdev->no_uld_attach flag will be set accordingly. The sas module supports adding/removing raid volumes using the online storage management application interface. This patch was provided to me by Christoph Hellwig.
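A minimal sketch of the usage this export enables (the helper below is hypothetical; only device_reprobe() and sdev->no_uld_attach come from this series): a SAS driver reacting to RAID volume creation or deletion could toggle the hidden-component flag on each member disk and then ask the bus to re-match drivers.

#include <linux/device.h>
#include <scsi/scsi_device.h>

/* Hypothetical helper: expose or hide a RAID member disk.  A hidden
 * component keeps its sg node for management applications, but no
 * upper-level driver (sd, st, ...) is attached to it. */
static void my_set_component_exposure(struct scsi_device *sdev, int expose)
{
	sdev->no_uld_attach = !expose;
	device_reprobe(&sdev->sdev_gendev);
}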
Signed-off-by: Eric Moore Signed-off-by: Greg Kroah-Hartman Signed-off-by: James Bottomley --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 58df18d..e8ac5bc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -378,6 +378,7 @@ extern void device_bind_driver(struct device * dev); extern void device_release_driver(struct device * dev); extern int device_attach(struct device * dev); extern void driver_attach(struct device_driver * drv); +extern void device_reprobe(struct device *dev); /* -- cgit v1.1 From 30afc84cf7325e88fb9746340eba3c161080ff49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Mar 2006 18:40:14 +0900 Subject: [SCSI] libata: implement minimal transport template for ->eh_timed_out SCSI midlayer has moved hostt->eh_timed_out to transport template. As libata doesn't need full-blown transport support yet, implement minimal transport for libata. No transport class or whatsoever, just empty transport template with ->eh_timed_out hook. Signed-off-by: Tejun Heo Signed-off-by: James Bottomley --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 239408e..204c37a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -508,7 +508,6 @@ extern void ata_host_set_remove(struct ata_host_set *host_set); extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg); extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); -extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); extern int ata_scsi_error(struct Scsi_Host *host); extern void ata_eh_qc_complete(struct ata_queued_cmd *qc); extern void ata_eh_qc_retry(struct ata_queued_cmd *qc); -- cgit v1.1 From 4de151d8cd2553e7e89044ab5d72fcad4eb04afb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 22 Mar 2006 00:13:35 +0100 Subject: It's UTF-8 Fix some comments to "UTF-8". Signed-off-by: Alexey Dobriyan Signed-off-by: Adrian Bunk --- include/linux/msdos_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index e933e2a..8bcd945 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -199,7 +199,7 @@ struct fat_mount_options { sys_immutable:1, /* set = system files are immutable */ dotsOK:1, /* set = hidden and system files are named '.filename' */ isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ - utf8:1, /* Use of UTF8 character set (Default) */ + utf8:1, /* Use of UTF-8 character set (Default) */ unicode_xlate:1, /* create escape sequences for unhandled Unicode */ numtail:1, /* Does first alias have a numeric '~1' type tail? 
*/ atari:1, /* Use Atari GEMDOS variation of MS-DOS fs */ -- cgit v1.1 From 116f232b3794a8b6ebde21aef5004b18cc1cfa86 Mon Sep 17 00:00:00 2001 From: Rytchkov Alexey Date: Wed, 22 Mar 2006 00:58:53 +0100 Subject: fixed path to moved file in include/linux/device.h Signed-off-by: Adrian Bunk --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 5b595fd..10c1693 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -399,7 +399,7 @@ extern struct device * get_device(struct device * dev); extern void put_device(struct device * dev); -/* drivers/base/power.c */ +/* drivers/base/power/shutdown.c */ extern void device_shutdown(void); -- cgit v1.1 From 89bbfc95d65839d6ae23ddab8a3cc5af4ae88383 Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Tue, 21 Mar 2006 23:58:08 -0800 Subject: [NET]: allow 32 bit socket ioctl in 64 bit kernel Since the register_ioctl32_conversion() patch in the kernel is now obsolete, provide another method to allow 32 bit user space ioctls to reach the kernel. Signed-off-by: Shaun Pereira Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 152fa65..84a490e 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -143,6 +143,8 @@ struct proto_ops { struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); + int (*compat_ioctl) (struct socket *sock, unsigned int cmd, + unsigned long arg); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, @@ -251,6 +253,8 @@ SOCKCALL_UWRAP(name, poll, (struct file *file, struct socket *sock, struct poll_ (file, sock, wait)) \ SOCKCALL_WRAP(name, ioctl, (struct socket *sock, unsigned int cmd, \ unsigned long arg), (sock, cmd, arg)) \ +SOCKCALL_WRAP(name, compat_ioctl, (struct socket *sock, unsigned int cmd, \ + unsigned long arg), (sock, cmd, arg)) \ SOCKCALL_WRAP(name, listen, (struct socket *sock, int len), (sock, len)) \ SOCKCALL_WRAP(name, shutdown, (struct socket *sock, int flags), (sock, flags)) \ SOCKCALL_WRAP(name, setsockopt, (struct socket *sock, int level, int optname, \ @@ -275,6 +279,7 @@ static const struct proto_ops name##_ops = { \ .getname = __lock_##name##_getname, \ .poll = __lock_##name##_poll, \ .ioctl = __lock_##name##_ioctl, \ + .compat_ioctl = __lock_##name##_compat_ioctl, \ .listen = __lock_##name##_listen, \ .shutdown = __lock_##name##_shutdown, \ .setsockopt = __lock_##name##_setsockopt, \ @@ -283,6 +288,7 @@ static const struct proto_ops name##_ops = { \ .recvmsg = __lock_##name##_recvmsg, \ .mmap = __lock_##name##_mmap, \ }; + #endif #define MODULE_ALIAS_NETPROTO(proto) \ -- cgit v1.1 From a64b7b936dcd926ace745c07c14f45ecfaddb034 Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Mar 2006 00:01:31 -0800 Subject: [X25]: allow ITU-T DTE facilities for x25 Allows use of the optional user facility to insert ITU-T (http://www.itu.int/ITU-T/) specified DTE facilities in call set-up x25 packets. This feature is optional; no facilities will be added if the ioctl is not used, and call setup packet remains the same as before. 
If the ioctls provided by the patch are used, then a facility marker will be added to the x25 packet header so that the called dte address extension facility can be differentiated from other types of facilities (as described in the ITU-T X.25 recommendation) that are also allowed in the x25 packet header. Facility markers are made up of two octets, and may be present in the x25 packet headers of call-request, incoming call, call accepted, clear request, and clear indication packets. The first of the two octets represents the facility code field and is set to zero by this patch. The second octet of the marker represents the facility parameter field and is set to 0x0F because the marker will be inserted before ITU-T type DTE facilities. Since according to ITU-T X.25 Recommendation X.25(10/96)- 7.1 "All networks will support the facility markers with a facility parameter field set to all ones or to 00001111", therefore this patch should work with all x.25 networks. While there are many ITU-T DTE facilities, this patch implements only the called and calling address extension, with placeholders in the x25_dte_facilities structure for the rest of the facilities. Testing: This patch was tested using a cisco xot router connected on its serial ports to an X.25 network, and on its lan ports to a host running an xotd daemon. It is also possible to test this patch using an xotd daemon and an x25tap patch, where the xotd daemons work back-to-back without actually using an x.25 network. See www.fyonne.net for details on how to do this. Signed-off-by: Shaun Pereira Acked-by: Andrew Hendry Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 16d4493..d035e4e 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -11,6 +11,8 @@ #ifndef X25_KERNEL_H #define X25_KERNEL_H +#include + #define SIOCX25GSUBSCRIP (SIOCPROTOPRIVATE + 0) #define SIOCX25SSUBSCRIP (SIOCPROTOPRIVATE + 1) #define SIOCX25GFACILITIES (SIOCPROTOPRIVATE + 2) @@ -21,6 +23,8 @@ #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) #define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) #define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) +#define SIOCX25GDTEFACILITIES (SIOCPROTOPRIVATE + 10) +#define SIOCX25SDTEFACILITIES (SIOCPROTOPRIVATE + 11) /* * Values for {get,set}sockopt. @@ -77,6 +81,8 @@ struct x25_subscrip_struct { #define X25_MASK_PACKET_SIZE 0x04 #define X25_MASK_WINDOW_SIZE 0x08 +#define X25_MASK_CALLING_AE 0x10 +#define X25_MASK_CALLED_AE 0x20 /* @@ -99,6 +105,26 @@ struct x25_facilities { }; /* +* ITU DTE facilities +* Only the called and calling address +* extension are currently implemented. +* The rest are in place to avoid the struct +* changing size if someone needs them later +*/ + +struct x25_dte_facilities { + __u16 delay_cumul; + __u16 delay_target; + __u16 delay_max; + __u8 min_throughput; + __u8 expedited; + __u8 calling_len; + __u8 called_len; + __u8 calling_ae[20]; + __u8 called_ae[20]; +}; + +/* * Call User Data structure. */ struct x25_calluserdata { -- cgit v1.1 From 9d2f928ddf64ca0361562e30faf584cd33055c60 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Mar 2006 10:53:19 +0100 Subject: [PATCH] Intruduce DMA_28BIT_MASK This patch introduces the DMA_28BIT_MASK constant in dma-mapping.h ALSA drivers using this mask are changed to use the new constant. 
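A minimal sketch of how a driver might use the new constant, assuming a device whose DMA engine can only address the low 256MB; the probe fragment and its error handling are illustrative, not taken from any of the converted ALSA drivers:

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/* Hypothetical probe fragment: refuse to load if the platform cannot
 * guarantee bus addresses that fit in 28 bits. */
static int my_probe_dma(struct pci_dev *pci)
{
	if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
	    pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
		dev_err(&pci->dev, "28bit DMA addressing not available\n");
		return -ENXIO;
	}
	return 0;
}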
Signed-off-by: Tobias Klauser Acked-by: Takashi Iwai Acked-by: Jaroslav Kysela --- include/linux/dma-mapping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2d80cc7..a873106 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -20,6 +20,7 @@ enum dma_data_direction { #define DMA_31BIT_MASK 0x000000007fffffffULL #define DMA_30BIT_MASK 0x000000003fffffffULL #define DMA_29BIT_MASK 0x000000001fffffffULL +#define DMA_28BIT_MASK 0x000000000fffffffULL #include -- cgit v1.1 From 4024ce5e0f396447cc1e07fd65c2a1d056b066bb Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 22 Mar 2006 00:07:43 -0800 Subject: [PATCH] rtc.h broke strace(1) builds Git patch 52dfa9a64cfb3dd01fa1ee1150d589481e54e28e [PATCH] move rtc_interrupt() prototype to rtc.h broke strace(1) builds. The below moves the kernel-only additions lower, under the already provided #ifdef __KERNEL__ statement. Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rtc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0b2ba67..b739ac1 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -11,8 +11,6 @@ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ -#include - /* * The struct used to pass data via the following ioctl. Similar to the * struct tm in , but it needs to be here so that the kernel @@ -95,6 +93,8 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include + typedef struct rtc_task { void (*func)(void *private_data); void *private_data; -- cgit v1.1 From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:07:59 -0800 Subject: [PATCH] mm: PageLRU no testset PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear operations. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d52999c..58856c8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -239,10 +239,9 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) -#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) -- cgit v1.1 From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: PageActive no testset PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear operations. 
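This and the preceding PageLRU patch rest on the same locking argument; a minimal sketch of the pattern, assuming a hypothetical caller that puts a page on the active list (add_page_to_active_list() is the existing mm_inline.h helper):

#include <linux/mm.h>
#include <linux/mm_inline.h>

/* With zone->lru_lock held, no other CPU can change this page's LRU
 * state, so the plain Set* variants are enough and the atomic
 * test-and-set/test-and-clear forms being removed are unnecessary. */
static void my_move_to_active(struct zone *zone, struct page *page)
{
	spin_lock_irq(&zone->lru_lock);
	SetPageLRU(page);
	SetPageActive(page);
	add_page_to_active_list(zone, page);
	spin_unlock_irq(&zone->lru_lock);
}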
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 58856c8..5d1e7bd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -246,8 +246,6 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -- cgit v1.1 From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: less atomic ops In the page release paths, we can be sure that nobody will mess with our page->flags because the refcount has dropped to 0. So no need for atomic operations here. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- include/linux/page-flags.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8ac854f..3b6723d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); if (PageActive(page)) { - ClearPageActive(page); + __ClearPageActive(page); zone->nr_active--; } else { zone->nr_inactive--; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5d1e7bd..da71d63 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -242,10 +242,12 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define PageLRU(page) test_bit(PG_lru, &(page)->flags) #define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define __ClearPageLRU(page) __clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -- cgit v1.1 From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:01 -0800 Subject: [PATCH] mm: page_alloc less atomics More atomic operation removal from page allocator Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index da71d63..76c7ffd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) #define 
PageCompound(page) test_bit(PG_compound, &(page)->flags) -#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) -#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags) +#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags) #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) -- cgit v1.1 From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:02 -0800 Subject: [PATCH] mm: slab less atomics Atomic operation removal from slab Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 76c7ffd..8cef69d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -250,10 +250,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -#define ClearPageSlab(page) clear_bit(PG_slab, &(page)->flags) -#define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags) -#define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) +#define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags) +#define __ClearPageSlab(page) __clear_bit(PG_slab, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) is_highmem(page_zone(page)) -- cgit v1.1 From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:03 -0800 Subject: [PATCH] mm: simplify vmscan vs release refcounting The VM has an interesting race where a page refcount can drop to zero, but it is still on the LRU lists for a short time. This was solved by testing a 0->1 refcount transition when picking up pages from the LRU, and dropping the refcount in that case. Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page from the LRU, thus a 0 refcount page will never have its refcount elevated until it is allocated again. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 498ff87..b12d5c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -301,17 +301,20 @@ struct page { * Drop a ref, return true if the logical refcount fell to zero (the page has * no users) */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(atomic_read(&(p)->_count) == -1);\ - atomic_add_negative(-1, &(p)->_count); \ - }) +static inline int put_page_testzero(struct page *page) +{ + BUG_ON(atomic_read(&page->_count) == -1); + return atomic_add_negative(-1, &page->_count); +} /* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page + * Try to grab a ref unless the page has a refcount of zero, return false if + * that is the case. 
*/ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) +static inline int get_page_unless_zero(struct page *page) +{ + return atomic_add_unless(&page->_count, 1, -1); +} #define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) #define __put_page(p) atomic_dec(&(p)->_count) -- cgit v1.1 From 8dc04efbfb3c08a08fb7a3b97348d5d561b26ae2 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:03 -0800 Subject: [PATCH] mm: de-skew page refcounting atomic_add_unless (atomic_inc_not_zero) no longer requires an offset refcount to function correctly. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b12d5c7..9bbddf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -286,15 +286,6 @@ struct page { * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. */ /* @@ -303,8 +294,8 @@ struct page { */ static inline int put_page_testzero(struct page *page) { - BUG_ON(atomic_read(&page->_count) == -1); - return atomic_add_negative(-1, &page->_count); + BUG_ON(atomic_read(&page->_count) == 0); + return atomic_dec_and_test(&page->_count); } /* @@ -313,10 +304,10 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - return atomic_add_unless(&page->_count, 1, -1); + return atomic_inc_not_zero(&page->_count); } -#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) +#define set_page_count(p,v) atomic_set(&(p)->_count, (v)) #define __put_page(p) atomic_dec(&(p)->_count) extern void FASTCALL(__page_cache_release(struct page *)); @@ -325,7 +316,7 @@ static inline int page_count(struct page *page) { if (PageCompound(page)) page = (struct page *)page_private(page); - return atomic_read(&page->_count) + 1; + return atomic_read(&page->_count); } static inline void get_page(struct page *page) -- cgit v1.1 From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:05 -0800 Subject: [PATCH] mm: split highorder pages Have an explicit mm call to split higher order pages into individual pages. Should help to avoid bugs and be more explicit about the code's intention. Signed-off-by: Nick Piggin Cc: Russell King Cc: David Howells Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: "David S. 
Miller" Cc: Chris Zankel Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bbddf2..e679806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -328,6 +328,12 @@ static inline void get_page(struct page *page) void put_page(struct page *page); +#ifdef CONFIG_MMU +void split_page(struct page *page, unsigned int order); +#else +static inline void split_page(struct page *page, unsigned int order) {} +#endif + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of -- cgit v1.1 From 9d41415221214ca4820b9464dfa548e2f20e7dd5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:06 -0800 Subject: [PATCH] mm: page_state comment more Clarify that preemption needs to be guarded against with the __xxx_page_state functions. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8cef69d..9ea629c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,9 @@ * - The __xxx_page_state variants can be used safely when interrupts are * disabled. * - The __xxx_page_state variants can be used if the field is only - * modified from process context, or only modified from interrupt context. - * In this case, the field should be commented here. + * modified from process context and protected from preemption, or only + * modified from interrupt context. In this case, the field should be + * commented here. */ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ -- cgit v1.1 From b50ec7d8070ae7a39fe78e65a8812bbc3ca2f7ac Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 22 Mar 2006 00:08:09 -0800 Subject: [PATCH] kcalloc(): INT_MAX -> ULONG_MAX Since size_t has the same size as a long on all architectures, it's enough for overflow checks to check against ULONG_MAX. This change could allow a compiler better optimization (especially in the n=1 case). The practical effect seems to be positive, but quite small: text data bss dec hex filename 21762380 5859870 1848928 29471178 1c1b1ca vmlinux-old 21762211 5859870 1848928 29471009 1c1b121 vmlinux-patched Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 8cf5293..38bed95 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -118,7 +118,7 @@ extern void *kzalloc(size_t, gfp_t); */ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) { - if (n != 0 && size > INT_MAX / n) + if (n != 0 && size > ULONG_MAX / n) return NULL; return kzalloc(n * size, flags); } -- cgit v1.1 From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:15 -0800 Subject: [PATCH] slab: Remove SLAB_NO_REAP option SLAB_NO_REAP is documented as an option that will cause this slab not to be reaped under memory pressure. However, that is not what happens. 
The only thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused slab elements that were allocated in batch in cache_reap(). Cache_reap() is run every few seconds independently of memory pressure. Could we remove the whole thing? It's only used by three slabs anyway and I cannot find a reason for having this option. There is an additional problem with SLAB_NO_REAP. If set then the recovery of objects from alien caches is switched off. Objects not freed on the same node where they were initially allocated will only be reused if a certain amount of objects accumulates from one alien node (not very likely) or if the cache is explicitly shrunk. (Strangely __cache_shrink does not check for SLAB_NO_REAP) Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 38bed95..2b28c84 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ #define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ #define SLAB_POISON 0x00000800UL /* Poison objects */ -#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ #define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ #define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ -- cgit v1.1 From 78eef01b0fae087c5fadbd85dd4fe2918c3a015f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:16 -0800 Subject: [PATCH] on_each_cpu(): disable local interrupts When on_each_cpu() runs the callback on other CPUs, it runs with local interrupts disabled. So we should run the function with local interrupts disabled on this CPU, too. And do the same for UP, so the callback is run in the same environment on both UP and SMP. (strictly it should do preempt_disable() too, but I think local_irq_disable is sufficiently equivalent). Also uninlines on_each_cpu(). softirq.c was the most appropriate file I could find, but it doesn't seem to justify creating a new file. Oh, and fix up that comment over (under?) x86's smp_call_function(). It drives me nuts.
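A minimal usage sketch, assuming a hypothetical driver that must flush per-CPU state everywhere; after this patch the callback runs with local interrupts disabled on the calling CPU as well, on both SMP and UP:

#include <linux/smp.h>

/* Runs on every CPU with local interrupts disabled, so it must not
 * sleep or take sleeping locks. */
static void my_flush_cpu(void *info)
{
	/* flush this CPU's private state here */
}

static void my_flush_all_cpus(void)
{
	/* retry=0: don't retry allocation failures;
	 * wait=1: return only once every CPU has run my_flush_cpu() */
	on_each_cpu(my_flush_cpu, NULL, 0, 1);
}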
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 44153fd..d699a16 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -extern int smp_call_function (void (*func) (void *info), void *info, - int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); /* * Call a function on all processors */ -static inline int on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, retry, wait); - func(info); - preempt_enable(); - return ret; -} +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void); #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) -#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) +#define on_each_cpu(func,info,retry,wait) \ + ({ \ + local_irq_disable(); \ + func(info); \ + local_irq_enable(); \ + 0; \ + }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -- cgit v1.1 From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:19 -0800 Subject: [PATCH] vmscan: use unsigned longs Turn basically everything in vmscan.c into `unsigned long'. This is to avoid the possibility that some piece of code in there might decide to operate upon more than 4G (or even 2G) of pages in one hit. This might be silly, but we'll need it one day. Cc: Christoph Lameter Cc: Nick Piggin Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/swap.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e679806..1850cf8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1046,7 +1046,7 @@ int in_gate_area_no_task(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); void drop_pagecache(void); void drop_slab(void); diff --git a/include/linux/swap.h b/include/linux/swap.h index d572b19..3dc6c89 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,8 +172,8 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, gfp_t); -extern int shrink_all_memory(int); +extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; #ifdef CONFIG_NUMA @@ -190,11 +190,11 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); -extern int putback_lru_pages(struct list_head *l); +extern unsigned long putback_lru_pages(struct list_head *l); extern int migrate_page(struct page *, struct page *); extern void migrate_page_copy(struct page *, struct page *); extern int migrate_page_remove_references(struct page *, struct page *, int); -extern int migrate_pages(struct list_head *l, struct list_head *t, +extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int fail_migrate_page(struct page *, struct page *); #else -- cgit v1.1 From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:33 -0800 Subject: [PATCH] mm: make __put_page internal Remove __put_page from outside the core mm/. It is dangerous because it does not handle compound pages nicely, and misses 1->0 transitions. If a user later appears that really needs the extra speed we can reevaluate. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1850cf8..9b3cdfc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,7 +308,6 @@ static inline int get_page_unless_zero(struct page *page) } #define set_page_count(p,v) atomic_set(&(p)->_count, (v)) -#define __put_page(p) atomic_dec(&(p)->_count) extern void FASTCALL(__page_cache_release(struct page *)); -- cgit v1.1 From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:34 -0800 Subject: [PATCH] mm: nommu use compound pages Now that compound page handling is properly fixed in the VM, move nommu over to using compound pages rather than rolling their own refcounting. nommu vm page refcounting is broken anyway, but there is no need to have divergent code in the core VM now, nor when it gets fixed. Signed-off-by: Nick Piggin Cc: David Howells (Needs testing, please). 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b3cdfc..3d84b7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -327,11 +327,7 @@ static inline void get_page(struct page *page) void put_page(struct page *page); -#ifdef CONFIG_MMU void split_page(struct page *page, unsigned int order); -#else -static inline void split_page(struct page *page, unsigned int order) {} -#endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of -- cgit v1.1 From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:40 -0800 Subject: [PATCH] remove set_page_count() outside mm/ set_page_count usage outside mm/ is limited to setting the refcount to 1. Remove set_page_count from outside mm/, and replace those users with init_page_count() and set_page_refcounted(). This allows more debug checking, and tighter control on how code is allowed to play around with page->_count. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d84b7a..7d8c127 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -307,8 +307,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -#define set_page_count(p,v) atomic_set(&(p)->_count, (v)) - extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) @@ -325,6 +323,15 @@ static inline void get_page(struct page *page) atomic_inc(&page->_count); } +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + atomic_set(&page->_count, 1); +} + void put_page(struct page *page); void split_page(struct page *page, unsigned int order); -- cgit v1.1 From 617d2214ee06c209e5c375c280d50abace8058e1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:43 -0800 Subject: [PATCH] mm: optimise page_count Optimise page_count compound page test and make it consistent with similar functions. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d8c127..6aa016f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -311,7 +311,7 @@ static inline int page_count(struct page *page) { - if (PageCompound(page)) + if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); return atomic_read(&page->_count); } -- cgit v1.1 From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 22 Mar 2006 00:08:50 -0800 Subject: [PATCH] Enable mprotect on huge pages 2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb mprotect. From: David Gibson Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test.
If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identify of hugetlb pte is lost when changing page protection via mprotect. A page fault occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting day. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad..fa83836 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 @@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void) #define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE -- cgit v1.1 From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:55 -0800 Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instatiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. 
(Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fa83836..cafe73e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); void free_huge_page(struct page *); @@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL @@ -132,6 +130,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); -- cgit v1.1 From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:56 -0800 Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local Originally, mm/hugetlb.c just handled the hugepage physical allocation path and its {alloc,free}_huge_page() functions were used from the arch specific hugepage code. These days those functions are only used within mm/hugetlb.c itself. Therefore, this patch makes them static and removes their prototypes from hugetlb.h. This requires a small rearrangement of code in mm/hugetlb.c to avoid a forward declaration. This patch causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5).
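Returning to the reservation interface added in the strict-reservation patch above: a minimal sketch of the intended mmap-time call, assuming the usual HUGETLBFS_I() container_of helper is available and assuming hugetlb_extend_reservation() returns 0 on success and an errno on failure (the caller itself is hypothetical):

#include <linux/fs.h>
#include <linux/hugetlb.h>

/* Hypothetical mmap-time hook: reserve at least 'hpages' huge pages
 * for this inode up front, so that later faults cannot oversubscribe
 * the hugepage pool. */
static int my_reserve_for_mapping(struct inode *inode, unsigned long hpages)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

	return hugetlb_extend_reservation(info, hpages);
}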
Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cafe73e..5d84c36 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) -- cgit v1.1 From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:57 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() free_pgtables() has special logic to call hugetlb_free_pgd_range() instead of the normal free_pgd_range() on hugepage VMAs. However, the test it uses to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized range at the start of the vma. is_hugepage_only_range() will return true if the given range has any intersection with a hugepage address region, and in this case the given region need not be hugepage aligned. So, for example, this test can return true if called on, say, a 4k VMA immediately preceding a (nicely aligned) hugepage VMA. At present we get away with this because the powerpc version of hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the only other arch with a non-trivial is_hugepage_only_range()) we get away with it for a different reason; the hugepage area is not contiguous with the rest of the user address space, and VMAs are not permitted in between, so the test can't return a false positive there. Nonetheless this should be fixed. We do that in the patch below by replacing the is_hugepage_only_range() test with an explicit test of the VMA using is_vm_hugetlb_page(). This in turn changes behaviour for platforms where is_hugepage_only_range() returns false always (everything except powerpc and ia64). We address this by ensuring that hugetlb_free_pgd_range() is defined to be identical to free_pgd_range() (instead of a no-op) on everything except ia64. Even so, it will prevent some otherwise possible coalescing of calls down to free_pgd_range(). Since this only happens for hugepage VMAs, removing this small optimization seems unlikely to cause any trouble. This patch causes no regressions on the libhugetlbfs testsuite - ppc64 POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP). 
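A minimal sketch of the corrected dispatch described above, simplified from the shape of the free_pgtables() loop (the wrapper is illustrative, not the actual mm/memory.c code):

#include <linux/hugetlb.h>
#include <linux/mm.h>

/* Test the VMA itself rather than probing a hugepage-sized range at
 * its start, which could falsely match a small VMA that merely abuts
 * a hugepage region. */
static void my_free_one_range(struct mmu_gather **tlb,
			      struct vm_area_struct *vma,
			      unsigned long floor, unsigned long ceiling)
{
	if (is_vm_hugetlb_page(vma))
		hugetlb_free_pgd_range(tlb, vma->vm_start, vma->vm_end,
				       floor, ceiling);
	else
		free_pgd_range(tlb, vma->vm_start, vma->vm_end,
			       floor, ceiling);
}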
Signed-off-by: David Gibson Cc: William Lee Irwin III Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5d84c36..e465fbf 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE @@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void) #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) -- cgit v1.1 From 3915bcf38fe0b6d130b4bbde97804f29a0becf32 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:59 -0800 Subject: [PATCH] hugepage: Move hugetlb_free_pgd_range() prototype to hugetlb.h The optional hugepage callback, hugetlb_free_pgd_range() is presently implemented non-trivially only on ia64 (but I plan to add one for powerpc shortly). It has its own prototype for the function in asm-ia64/pgtable.h. However, since the function is called from generic code, it make sense for its prototype to be in the generic hugetlb.h header file, as the protypes other arch callbacks already are (prepare_hugepage_range(), set_huge_pte_at(), etc.). This patch makes it so. Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e465fbf..5db25ff 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -47,6 +47,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE #define hugetlb_free_pgd_range free_pgd_range +#else +void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, + unsigned long ceiling); #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -- cgit v1.1 From 42b88befd6e0dae1a5fe04c03925037fa890e1f3 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:09:01 -0800 Subject: [PATCH] hugepage: is_aligned_hugepage_range() cleanup Quite a long time back, prepare_hugepage_range() replaced is_aligned_hugepage_range() as the callback from mm/mmap.c to arch code to verify if an address range is suitable for a hugepage mapping. is_aligned_hugepage_range() stuck around, but only to implement prepare_hugepage_range() on archs which didn't implement their own. Most archs (everything except ia64 and powerpc) used the same implementation of is_aligned_hugepage_range(). On powerpc, which implements its own prepare_hugepage_range(), the custom version was never used. 
In addition, "is_aligned_hugepage_range()" was a bad name, because it suggests it returns true iff the given range is a good hugepage range, whereas in fact it returns 0-or-error (so the sense is reversed). This patch cleans up by abolishing is_aligned_hugepage_range(). Instead prepare_hugepage_range() is defined directly. Most archs use the default version, which simply checks the given region is aligned to the size of a hugepage. ia64 and powerpc define custom versions. The ia64 one simply checks that the range is in the correct address space region in addition to being suitably aligned. The powerpc version (just as previously) checks for suitable addresses, and if necessary performs low-level MMU frobbing to set up new areas for use by hugepages. No libhugetlbfs testsuite regressions on ppc64 (POWER5 LPAR). Signed-off-by: David Gibson Signed-off-by: Zhang Yanmin Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5db25ff..d6f1019 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -36,7 +36,6 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); @@ -54,8 +53,18 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define prepare_hugepage_range(addr, len) \ - is_aligned_hugepage_range(addr, len) +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. + */ +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} #else int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif @@ -95,7 +104,6 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define is_aligned_hugepage_range(addr, len) 0 #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -- cgit v1.1 From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:12 -0800 Subject: [PATCH] page migration reorg Centralize the page migration functions in anticipation of additional tinkering. Creates a new file mm/migrate.c 1. Extract buffer_migrate_page() from fs/buffer.c 2. Extract central migration code from vmscan.c 3. Extract some components from mempolicy.c 4. Export pageout() and remove_from_swap() from vmscan.c 5. Make it possible to configure NUMA systems without page migration and non-NUMA systems with page migration. I had to so some #ifdeffing in mempolicy.c that may need a cleanup. 
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/swap.h | 34 +++++++++++++++------------------- 2 files changed, 51 insertions(+), 19 deletions(-) create mode 100644 include/linux/migrate.h (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h new file mode 100644 index 0000000..7d09962 --- /dev/null +++ b/include/linux/migrate.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_MIGRATE_H +#define _LINUX_MIGRATE_H + +#include +#include + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p, struct list_head *pagelist); +extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest); +extern int fail_migrate_page(struct page *, struct page *); + +extern int migrate_prep(void); + +#else + +static inline int isolate_lru_page(struct page *p, struct list_head *list) + { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } + +static inline int migrate_prep(void) { return -ENOSYS; } + +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL + +#endif /* CONFIG_MIGRATION */ +#endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 3dc6c89..12415dd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,6 +175,21 @@ extern void swap_setup(void); extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int remove_mapping(struct address_space *mapping, struct page *page); + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +extern pageout_t pageout(struct page *page, struct address_space *mapping); #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p); -extern unsigned long putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); -extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); -extern int fail_migrate_page(struct page *, struct page *); -#else -static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } -static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head 
*l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL -#define fail_migrate_page NULL -#endif - #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); -- cgit v1.1