From e78a77c38cf0ce3b8169ff6a6fd3711e81dc22c8 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: Minor cleanups - uninline timer functions; the compiler knows better than we do whether or not to inline these. - mfgpt_start_timer() had an unused 'clock' argument, drop it. From both Jordan and myself. Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 219f86e..9146b2d 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -244,7 +244,7 @@ static int __init mfgpt_setup(char *str) } __setup("mfgpt_irq=", mfgpt_setup); -static inline void mfgpt_disable_timer(u16 clock) +static void mfgpt_disable_timer(u16 clock) { u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP); geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN); @@ -263,7 +263,7 @@ static struct clock_event_device mfgpt_clockevent = { .shift = 32 }; -static inline void mfgpt_start_timer(u16 clock, u16 delta) +static void mfgpt_start_timer(u16 delta) { geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); @@ -278,14 +278,14 @@ static void mfgpt_set_mode(enum clock_event_mode mode, mfgpt_disable_timer(mfgpt_event_clock); if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(mfgpt_event_clock, MFGPT_PERIODIC); + mfgpt_start_timer(MFGPT_PERIODIC); mfgpt_tick_mode = mode; } static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) { - mfgpt_start_timer(mfgpt_event_clock, delta); + mfgpt_start_timer(delta); return 0; } -- cgit v1.1 From 36445cf30686b9ea4ddf71f28057e4dd07db0e2d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE fix MFGPT input clock value The GEODE MFGPT code assumed that 32kHz was 32000 Hz while the boards run on a 32.768 kHz digital watch crystal. In practise, it will not change the timer's frequency as the skew was only 2.4%, but it should provide more accurate intervals. Signed-off-by: Willy Tarreau Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 9146b2d..5862281 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -12,21 +12,20 @@ */ /* - * We are using the 32Khz input clock - its the only one that has the + * We are using the 32.768kHz input clock - it's the only one that has the * ranges we find desirable. The following table lists the suitable - * divisors and the associated hz, minimum interval - * and the maximum interval: + * divisors and the associated Hz, minimum interval and the maximum interval: * - * Divisor Hz Min Delta (S) Max Delta (S) - * 1 32000 .0005 2.048 - * 2 16000 .001 4.096 - * 4 8000 .002 8.192 - * 8 4000 .004 16.384 - * 16 2000 .008 32.768 - * 32 1000 .016 65.536 - * 64 500 .032 131.072 - * 128 250 .064 262.144 - * 256 125 .128 524.288 + * Divisor Hz Min Delta (s) Max Delta (s) + * 1 32768 .00048828125 2.000 + * 2 16384 .0009765625 4.000 + * 4 8192 .001953125 8.000 + * 8 4096 .00390625 16.000 + * 16 2048 .0078125 32.000 + * 32 1024 .015625 64.000 + * 64 512 .03125 128.000 + * 128 256 .0625 256.000 + * 256 128 .125 512.000 */ #include @@ -45,7 +44,7 @@ static struct mfgpt_timer_t { #define MFGPT_DIVISOR 16 #define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32000 / MFGPT_DIVISOR) +#define MFGPT_HZ (32768 / MFGPT_DIVISOR) #define MFGPT_PERIODIC (MFGPT_HZ / HZ) #ifdef CONFIG_GEODE_MFGPT_TIMER -- cgit v1.1 From fa28e067c3b8af96c79c060e163b1387c172ae75 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: drop module owner usage from MFGPT API We had planned to use the 'owner' field for allowing re-allocation of MFGPTs; however, doing it by module owner name isn't flexible enough. So, drop this for now. If it turns out that we need timers in modules, we'll need to come up with a scheme that matches the write-once fields of the MFGPTx_SETUP register, and drops ponies from the sky. Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 5862281..186bd36 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -30,14 +30,12 @@ #include #include -#include #include #define F_AVAIL 0x01 static struct mfgpt_timer_t { int flags; - struct module *owner; } mfgpt_timers[MFGPT_MAX_TIMERS]; /* Selected from the table above */ @@ -182,15 +180,14 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) return 0; } -static int mfgpt_get(int timer, struct module *owner) +static int mfgpt_get(int timer) { mfgpt_timers[timer].flags &= ~F_AVAIL; - mfgpt_timers[timer].owner = owner; printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); return timer; } -int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) +int geode_mfgpt_alloc_timer(int timer, int domain) { int i; @@ -203,7 +200,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) /* Try to find an available timer */ for (i = 0; i < MFGPT_MAX_TIMERS; i++) { if (mfgpt_timers[i].flags & F_AVAIL) - return mfgpt_get(i, owner); + return mfgpt_get(i); if (i == 5 && domain == MFGPT_DOMAIN_WORKING) break; @@ -211,7 +208,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain, struct module *owner) } else { /* If they requested a specific timer, try to honor that */ if (mfgpt_timers[timer].flags & F_AVAIL) - return mfgpt_get(timer, owner); + return mfgpt_get(timer); } /* No timers available - too bad */ @@ -324,8 +321,7 @@ static int __init mfgpt_timer_setup(void) int timer, ret; u16 val; - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING, - THIS_MODULE); + timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); if (timer < 0) { printk(KERN_ERR "mfgpt-timer: Could not allocate a MFPGT timer\n"); -- cgit v1.1 From 9501b2efd70ad3957a70d44de54dab7c52f9b882 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: replace 'flags' field with 'avail' bit Drop F_AVAIL and the 'flags' field, replacing with an 'avail' bit. This looks more understandable to me. Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 186bd36..6f79061 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -32,10 +32,8 @@ #include #include -#define F_AVAIL 0x01 - static struct mfgpt_timer_t { - int flags; + unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; /* Selected from the table above */ @@ -95,7 +93,7 @@ int __init geode_mfgpt_detect(void) for (i = 0; i < MFGPT_MAX_TIMERS; i++) { val = geode_mfgpt_read(i, MFGPT_REG_SETUP); if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].flags = F_AVAIL; + mfgpt_timers[i].avail = 1; count++; } } @@ -182,7 +180,7 @@ int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) static int mfgpt_get(int timer) { - mfgpt_timers[timer].flags &= ~F_AVAIL; + mfgpt_timers[timer].avail = 0; printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); return timer; } @@ -199,7 +197,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) if (timer < 0) { /* Try to find an available timer */ for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].flags & F_AVAIL) + if (mfgpt_timers[i].avail) return mfgpt_get(i); if (i == 5 && domain == MFGPT_DOMAIN_WORKING) @@ -207,7 +205,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) } } else { /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].flags & F_AVAIL) + if (mfgpt_timers[timer].avail) return mfgpt_get(timer); } -- cgit v1.1 From b0e6bf2571e9385335e6337bdedb85cb629ab3fb Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: make mfgpt_timer_setup available outside of mfgpt_32.c We need to be called from elsewhere, and this gets some #ifdefs out of the .c file. Signed-off-by: Andres Salomon Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 6f79061..5cf3a83 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -43,12 +43,6 @@ static struct mfgpt_timer_t { #define MFGPT_HZ (32768 / MFGPT_DIVISOR) #define MFGPT_PERIODIC (MFGPT_HZ / HZ) -#ifdef CONFIG_GEODE_MFGPT_TIMER -static int __init mfgpt_timer_setup(void); -#else -#define mfgpt_timer_setup() (0) -#endif - /* Allow for disabling of MFGPTs */ static int disable; static int __init mfgpt_disable(char *s) @@ -314,7 +308,7 @@ static struct irqaction mfgptirq = { .name = "mfgpt-timer" }; -static int __init mfgpt_timer_setup(void) +int __init mfgpt_timer_setup(void) { int timer, ret; u16 val; -- cgit v1.1 From f087515c658a68454d43909d482ea4b59e7d6d5c Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: Use "just-in-time" detection for the MFGPT timers There isn't much value to always detecting the MFGPT timers on Geode platforms; detection is only needed when something wants to use the timers. Move the detection code so that it gets called the first time a timer is allocated. Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/geode_32.c | 5 +---- arch/x86/kernel/mfgpt_32.c | 39 +++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index 9c7f7d3..9dad6ca 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -163,14 +163,11 @@ EXPORT_SYMBOL_GPL(geode_gpio_setup_event); static int __init geode_southbridge_init(void) { - int timers; - if (!is_geode()) return -ENODEV; init_lbars(); - timers = geode_mfgpt_detect(); - printk(KERN_INFO "geode: %d MFGPT timers available.\n", timers); + (void) mfgpt_timer_setup(); return 0; } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 5cf3a83..abdb7c7 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -74,28 +74,37 @@ __setup("mfgptfix", mfgpt_fix); * In other cases (such as with VSAless OpenFirmware), the system firmware * leaves timers available for us to use. */ -int __init geode_mfgpt_detect(void) + + +static int timers = -1; + +static void geode_mfgpt_detect(void) { - int count = 0, i; + int i; u16 val; + timers = 0; + if (disable) { - printk(KERN_INFO "geode-mfgpt: Skipping MFGPT setup\n"); - return 0; + printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); + goto done; + } + + if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { + printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); + goto done; } for (i = 0; i < MFGPT_MAX_TIMERS; i++) { val = geode_mfgpt_read(i, MFGPT_REG_SETUP); if (!(val & MFGPT_SETUP_SETUP)) { mfgpt_timers[i].avail = 1; - count++; + timers++; } } - /* set up clock event device, if desired */ - i = mfgpt_timer_setup(); - - return count; +done: + printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); } int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) @@ -183,10 +192,16 @@ int geode_mfgpt_alloc_timer(int timer, int domain) { int i; - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) - return -ENODEV; + if (timers == -1) { + /* timers haven't been detected yet */ + geode_mfgpt_detect(); + } + + if (!timers) + return -1; + if (timer >= MFGPT_MAX_TIMERS) - return -EIO; + return -1; if (timer < 0) { /* Try to find an available timer */ -- cgit v1.1 From f54ae69bafa16434ce46bc2f1fe556bce4d23650 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: fix a potential race when disabling a timer We *really* don't want to be reading MFGPTx_SETUP and writing back those values. What we want to be doing is clearing CMP1 and CMP2 unconditionally; otherwise, we have races where CMP1 and/or CMP2 fire after we've read MFGPTx_SETUP. They can also fire between when we've written ~CNTEN to the register, and when the new register values get copied to the timer's version of the register. By clearing both fields, we're okay. Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index abdb7c7..81aa9db 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -249,8 +249,9 @@ __setup("mfgpt_irq=", mfgpt_setup); static void mfgpt_disable_timer(u16 clock) { - u16 val = geode_mfgpt_read(clock, MFGPT_REG_SETUP); - geode_mfgpt_write(clock, MFGPT_REG_SETUP, val & ~MFGPT_SETUP_CNTEN); + /* avoid races by clearing CMP1 and CMP2 unconditionally */ + geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | + MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); } static int mfgpt_next_event(unsigned long, struct clock_event_device *); -- cgit v1.1 From dcee77be2f0a7010633fb2c025db38550c4b0e72 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: make sure the right MFGPT timer fired the timer tick Each AMD Geode MFGPT timer interrupt output is paired with another timer; esentially the interrupt goes if either timer fires. This is okay, but the handlers need to be aware of this. Make sure in the timer tick handler that our timer really did expire. Signed-off-by: Jordan Crouse Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 81aa9db..eeb461f 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -293,10 +293,14 @@ static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) return 0; } -/* Assume (foolishly?), that this interrupt was due to our tick */ - static irqreturn_t mfgpt_tick(int irq, void *dev_id) { + u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); + + /* See if the interrupt was for us */ + if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) + return IRQ_NONE; + /* Turn off the clock (and clear the event) */ mfgpt_disable_timer(mfgpt_event_clock); -- cgit v1.1 From 3406c158ba8e83defb178e867919e24e110a59bf Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: GEODE: MFGPT: fix typo in printk in mfgpt_timer_setup Signed-off-by: Arnd Hannemann Signed-off-by: Andres Salomon Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mfgpt_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index eeb461f..027fc06 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -370,7 +370,7 @@ int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGT timer as a clock event.\n"); + "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); clockevents_register_device(&mfgpt_clockevent); return 0; -- cgit v1.1 From 88a5ac89667d22e1471ba1f45ea635df1f7da06f Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: fix sparse warning in xen/time.c Use xen_khz to denote xen_specific clock speed. Avoid shadowing cpu_khz. arch/x86/xen/time.c:220:6: warning: symbol 'cpu_khz' shadows an earlier one include/asm/tsc.h:17:21: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/xen/time.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3721fd..c39e1a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -217,17 +217,17 @@ unsigned long long xen_sched_clock(void) /* Get the CPU speed from Xen */ unsigned long xen_cpu_khz(void) { - u64 cpu_khz = 1000000ULL << 32; + u64 xen_khz = 1000000ULL << 32; const struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(cpu_khz, info->tsc_to_system_mul); + do_div(xen_khz, info->tsc_to_system_mul); if (info->tsc_shift < 0) - cpu_khz <<= -info->tsc_shift; + xen_khz <<= -info->tsc_shift; else - cpu_khz >>= info->tsc_shift; + xen_khz >>= info->tsc_shift; - return cpu_khz; + return xen_khz; } /* -- cgit v1.1 From 7c36752a6be84892afb085c67fd4209e686db482 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: sparse warning in therm_throt.c arch/x86/kernel/cpu/mcheck/therm_throt.c:121:2: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 24885be..9b7e01d 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -118,7 +118,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); } /* Mutex protecting device creation against CPU hotplug */ -- cgit v1.1 From da7bfc50f5cb54aeee8147dca0c1de9d487cb5e0 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: sparse warnings in pageattr.c Adjust the definition of lookup_address to take an unsigned long level argument. Adjust callers in xen/mmu.c that pass in a dummy variable. Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 8 +++++--- arch/x86/xen/mmu.c | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 8493c85..eb2a544 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -191,7 +191,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) * or when the present bit is not set. Otherwise we would return a * pointer to a nonexisting mapping. */ -pte_t *lookup_address(unsigned long address, int *level) +pte_t *lookup_address(unsigned long address, unsigned int *level) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; @@ -255,7 +255,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, unsigned long nextpage_addr, numpages, pmask, psize, flags; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int level, do_split = 1; + int do_split = 1; + unsigned int level; spin_lock_irqsave(&pgd_lock, flags); /* @@ -406,7 +407,8 @@ out_unlock: static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { - int level, do_split, err; + int do_split, err; + unsigned int level; struct page *kpte_page; pte_t *kpte; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 45aa771..0144395 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,7 +58,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { - int level; + unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & PAGE_MASK; @@ -71,7 +71,7 @@ void make_lowmem_page_readonly(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); @@ -86,7 +86,7 @@ void make_lowmem_page_readwrite(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; - int level; + unsigned int level; pte = lookup_address(address, &level); BUG_ON(pte == NULL); -- cgit v1.1 From 9583d050d5b7bad76423b2bd667b174a122067a7 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: fix sparse warning in topology.c arch/x86/kernel/topology.c:56:2: warning: returning void-valued expression Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index e6757aa..a40051b 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL(arch_register_cpu); void arch_unregister_cpu(int num) { - return unregister_cpu(&per_cpu(cpu_devices, num).cpu); + unregister_cpu(&per_cpu(cpu_devices, num).cpu); } EXPORT_SYMBOL(arch_unregister_cpu); #else -- cgit v1.1 From 3701d863b43d05ffeb223d269583398f914fb5d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 9 Feb 2008 23:24:08 +0100 Subject: x86: fixup more paravirt fallout Use a common irq_return entry point for all the iret places, which need the paravirt INTERRUPT return wrapper. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 15 ++++++--------- arch/x86/kernel/entry_64.S | 18 +++++++++++++----- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index be5c31d..824e21b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,8 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -1: INTERRUPT_RETURN +ENTRY(irq_return) + INTERRUPT_RETURN .section .fixup,"ax" iret_exc: pushl $0 # no error code @@ -418,7 +419,7 @@ iret_exc: .previous .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long irq_return,iret_exc .previous CFI_RESTORE_STATE @@ -865,20 +866,16 @@ nmi_espfix_stack: RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 -1: INTERRUPT_RETURN + jmp irq_return CFI_ENDPROC -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous KPROBE_END(nmi) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) -1: iret + iret .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long native_iret, iret_exc .previous END(native_iret) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7341e8..6be39a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -581,16 +581,24 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 0,8,0 -#ifdef CONFIG_PARAVIRT + RESTORE_ARGS 0,8,0 + +ENTRY(irq_return) INTERRUPT_RETURN -#endif + + .section __ex_table, "a" + .quad irq_return, bad_iret + .previous + +#ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq .section __ex_table,"a" .quad native_iret, bad_iret .previous +#endif + .section .fixup,"ax" bad_iret: /* @@ -804,7 +812,7 @@ paranoid_swapgs\trace: SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - INTERRUPT_RETURN + jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -919,7 +927,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq native_iret(%rip),%rbp + leaq irq_return(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ -- cgit v1.1 From bfc734b24671b2639218ae2ef53af91dfd30b6c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: avoid unused variable warning in mm/init_64.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fe880f..620d2b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -532,9 +532,9 @@ void __init mem_init(void) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; + unsigned long addr = begin; - if (begin >= end) + if (addr >= end) return; /* @@ -549,7 +549,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) #else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), -- cgit v1.1 From 185c045c245f46485ad8bbd8cc1100e986ff3f13 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86, core: remove CONFIG_FORCED_INLINING Other than the defconfigs, remove the entry in compiler-gcc4.h, Kconfig.debug and feature-removal-schedule.txt. Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 77562e7..3df340b 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1421,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9e2b0ef..eef98cb 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1346,7 +1346,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set -- cgit v1.1 From 551889a6e2a24a9c06fd453ea03b57b7746ffdc0 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: construct 32-bit boot time page tables in native format. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specifically the boot time page tables in a CONFIG_X86_PAE=y enabled kernel are in PAE format. early_ioremap is updated to use the standard page table accessors. Clear any mappings beyond max_low_pfn from the boot page tables in native_pagetable_setup_start because the initial mappings can extend beyond the range of physical memory and into the vmalloc area. Derived from patches by Eric Biederman and H. Peter Anvin. [ jeremy@goop.org: PAE swapper_pg_dir needs to be page-sized fix ] Signed-off-by: Ian Campbell Cc: H. Peter Anvin Cc: Eric W. Biederman Cc: Andi Kleen Cc: Mika Penttilä Cc: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head_32.S | 151 ++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup_32.c | 4 ++ arch/x86/mm/init_32.c | 72 ++++++++------------- arch/x86/mm/ioremap.c | 55 ++++++++++------- 4 files changed, 178 insertions(+), 104 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 5d8c573..74ef4a4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,6 +19,10 @@ #include #include #include +#include + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) /* * References to members of the new_cpu_data structure. @@ -80,10 +84,6 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_ */ .section .text.head,"ax",@progbits ENTRY(startup_32) - /* check to see if KEEP_SEGMENTS flag is meaningful */ - cmpw $0x207, BP_version(%esi) - jb 1f - /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ testb $(1<<6), BP_loadflags(%esi) @@ -92,7 +92,7 @@ ENTRY(startup_32) /* * Set segments to known values. */ -1: lgdt boot_gdt_descr - __PAGE_OFFSET + lgdt pa(boot_gdt_descr) movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es @@ -105,8 +105,8 @@ ENTRY(startup_32) */ cld xorl %eax,%eax - movl $__bss_start - __PAGE_OFFSET,%edi - movl $__bss_stop - __PAGE_OFFSET,%ecx + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx rep ; stosl @@ -118,31 +118,32 @@ ENTRY(startup_32) * (kexec on panic case). Hence copy out the parameters before initializing * page tables. */ - movl $(boot_params - __PAGE_OFFSET),%edi + movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld rep movsl - movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi + movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No comand line - movl $(boot_command_line - __PAGE_OFFSET),%edi + movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl 1: #ifdef CONFIG_PARAVIRT - cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET) + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) jb default_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ - movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax + movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax jae bad_subarch - movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax + movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax @@ -170,17 +171,68 @@ num_subarch_entries = (. - subarch_entries) / 4 * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. * - * Warning: don't use %esi or the stack in this code. However, %esp - * can be used as a GPR if you really need it... + * Note that the stack is not yet set up! */ -page_pde_offset = (__PAGE_OFFSET >> 20); +#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ +#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ default_entry: - movl $(pg0 - __PAGE_OFFSET), %edi - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode swapper_pg_dir is statically defined to contain enough + * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD + * entries to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at + * this stage. + */ + +#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(pg0), %edi + movl $pa(swapper_pg_pmd), %edx + movl $PTE_ATTR, %eax +10: + leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables. + */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + cmpl %ebp,%eax + jb 10b +1: + movl %edi,pa(init_pg_tables_end) + + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(pg0), %edi + movl $pa(swapper_pg_dir), %edx + movl $PTE_ATTR, %eax 10: - leal 0x007(%edi),%ecx /* Create PDE entry */ + leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -189,19 +241,20 @@ default_entry: stosl addl $0x1000,%eax loop 11b - /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ - /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ - leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + /* + * End condition: we must map up to and including INIT_MAP_BEYOND_END + * bytes beyond the end of our own page tables; the +0x007 is + * the attribute bits + */ + leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b - movl %edi,(init_pg_tables_end - __PAGE_OFFSET) - - /* Do an early initialization of the fixmap area */ - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax - addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ - movl %eax, 4092(%edx) + movl %edi,pa(init_pg_tables_end) + /* Do early initialization of the fixmap area */ + movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl %eax,pa(swapper_pg_dir+0xffc) +#endif jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -241,7 +294,7 @@ ENTRY(startup_32_smp) * NOTE! We have to correct for the fact that we're * not yet offset PAGE_OFFSET.. */ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET +#define cr4_bits pa(mmu_cr4_features) movl cr4_bits,%edx andl %edx,%edx jz 6f @@ -276,10 +329,10 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $swapper_pg_dir-__PAGE_OFFSET,%eax + movl $pa(swapper_pg_dir),%eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax - orl $0x80000000,%eax + orl $X86_CR0_PG,%eax movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: @@ -552,16 +605,44 @@ ENTRY(_stext) */ .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm +#ifdef CONFIG_X86_PAE +ENTRY(swapper_pg_pmd) + .fill 1024*KPMDS,4,0 +#else ENTRY(swapper_pg_dir) .fill 1024,4,0 -ENTRY(swapper_pg_pmd) +#endif +ENTRY(swapper_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 - /* * This starts the data section. */ +#ifdef CONFIG_X86_PAE +.section ".data.page_aligned","wa" + /* Page-aligned for the benefit of paravirt? */ + .align PAGE_SIZE_asm +ENTRY(swapper_pg_dir) + .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(swapper_pg_pmd+PGD_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE_asm /* needs to be page-sized too */ +#endif + .data ENTRY(stack_start) .long init_thread_union+THREAD_SIZE diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index d1d8c34..691ab4c 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -154,7 +154,11 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); +#ifndef CONFIG_X86_PAE unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d1bc040..54aba3c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -46,6 +46,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -328,44 +329,38 @@ pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { -#ifdef CONFIG_X86_PAE - int i; + unsigned long pfn, va; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; /* - * Init entries of the first-level page table to the - * zero page, if they haven't already been set up. - * - * In a normal native boot, we'll be running on a - * pagetable rooted in swapper_pg_dir, but not in PAE - * mode, so this will end up clobbering the mappings - * for the lower 24Mbytes of the address space, - * without affecting the kernel address space. + * Remove any mappings which extend past the end of physical + * memory from the boot time page table: */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) - set_pgd(&base[i], - __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); - - /* Make sure kernel address space is empty so that a pagetable - will be allocated for it. */ - memset(&base[USER_PTRS_PER_PGD], 0, - KERNEL_PGD_PTRS * sizeof(pgd_t)); -#else + for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<> PAGE_SHIFT); -#endif } void __init native_pagetable_setup_done(pgd_t *base) { -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - set_pgd(&base[0], base[USER_PTRS_PER_PGD]); -#endif } /* @@ -374,9 +369,8 @@ void __init native_pagetable_setup_done(pgd_t *base) * the boot process. * * If we're booting on native hardware, this will be a pagetable - * constructed in arch/i386/kernel/head.S, and not running in PAE mode - * (even if we'll end up running in PAE). The root of the pagetable - * will be swapper_pg_dir. + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. * * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may @@ -537,14 +531,6 @@ void __init paging_init(void) load_cr3(swapper_pg_dir); -#ifdef CONFIG_X86_PAE - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif __flush_tlb_all(); kmap_init(); @@ -675,10 +661,6 @@ void __init mem_init(void) BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index ee6648f..1106b7f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -260,41 +260,46 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static __initdata unsigned long bm_pte[1024] +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __attribute__((aligned(PAGE_SIZE))); -static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { - return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); + pgd_t *pgd = &swapper_pg_dir[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; } -static inline unsigned long * __init early_ioremap_pte(unsigned long addr) +static inline pte_t * __init early_ioremap_pte(unsigned long addr) { - return bm_pte + ((addr >> PAGE_SHIFT) & 1023); + return &bm_pte[pte_index(addr)]; } void __init early_ioremap_init(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = __pa(bm_pte) | _PAGE_TABLE; + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); + set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE)); + /* - * The boot-ioremap range spans multiple pgds, for which + * The boot-ioremap range spans multiple pmds, for which * we are not prepared: */ - if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); - printk(KERN_WARNING "pgd %p != %p\n", - pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", - fix_to_virt(FIX_BTMAP_BEGIN)); + fix_to_virt(FIX_BTMAP_BEGIN)); printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", - fix_to_virt(FIX_BTMAP_END)); + fix_to_virt(FIX_BTMAP_END)); printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", @@ -304,28 +309,29 @@ void __init early_ioremap_init(void) void __init early_ioremap_clear(void) { - unsigned long *pgd; + pmd_t *pmd; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_clear()\n"); - pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); - *pgd = 0; - paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + pmd_clear(pmd); + paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT); __flush_tlb_all(); } void __init early_ioremap_reset(void) { enum fixed_addresses idx; - unsigned long *pte, phys, addr; + unsigned long addr, phys; + pte_t *pte; after_paging_init = 1; for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { addr = fix_to_virt(idx); pte = early_ioremap_pte(addr); - if (*pte & _PAGE_PRESENT) { - phys = *pte & PAGE_MASK; + if (pte_present(*pte)) { + phys = pte_val(*pte) & PAGE_MASK; set_fixmap(idx, phys); } } @@ -334,7 +340,8 @@ void __init early_ioremap_reset(void) static void __init __early_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) { - unsigned long *pte, addr = __fix_to_virt(idx); + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; if (idx >= __end_of_fixed_addresses) { BUG(); @@ -342,9 +349,9 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } pte = early_ioremap_pte(addr); if (pgprot_val(flags)) - *pte = (phys & PAGE_MASK) | pgprot_val(flags); + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else - *pte = 0; + pte_clear(NULL, addr, pte); __flush_tlb_one(addr); } -- cgit v1.1 From b6fbb669c8ef3a112121697ca901c290ccd35eb2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: fix early_ioremap pagetable ops Some important parts of f6df72e71eba621b2f5c49b3a763116fac748f6e got dropped along the way, reintroduce them. Only affects paravirt guests. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1106b7f..a4897a8 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -286,7 +286,7 @@ void __init early_ioremap_init(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); - set_pmd(pmd, __pmd(__pa(bm_pte) | _PAGE_TABLE)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which @@ -316,7 +316,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(pmd) >> PAGE_SHIFT); + paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } -- cgit v1.1 From 9b706aee7d92d6ac3002547aea12e3eaa0a750ae Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: trivial printk optimizations In arch/x86/boot/printf.c gets rid of unused tail of digits: const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; (we are using 0-9a-f only) Uses smaller/faster lowercasing (by ORing with 0x20) if we know that we work on numbers/digits. Makes strtoul smaller, and also we are getting rid of static const char small_digits[] = "0123456789abcdefx"; static const char large_digits[] = "0123456789ABCDEFX"; since this works equally well: static const char digits[16] = "0123456789ABCDEF"; Size savings: $ size vmlinux.org vmlinux text data bss dec hex filename 877320 112252 90112 1079684 107984 vmlinux.org 877048 112252 90112 1079412 107874 vmlinux It may be also a tiny bit faster because code has less branches now, but I doubt it is measurable. [ hugh@veritas.com: uppercase pointers fix ] Signed-off-by: Denys Vlasenko Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/printf.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 1a09f93..7e7e890 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -33,8 +33,8 @@ static int skip_atoi(const char **s) #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ -#define SPECIAL 32 /* 0x */ -#define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ #define do_div(n,base) ({ \ int __res; \ @@ -45,12 +45,16 @@ __res; }) static char *number(char *str, long num, int base, int size, int precision, int type) { - char c, sign, tmp[66]; - const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz"; + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + char tmp[66]; + char c, sign, locase; int i; - if (type & LARGE) - digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + /* locase = 0 or 0x20. ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters */ + locase = (type & SMALL); if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) @@ -81,7 +85,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = digits[do_div(num, base)]; + tmp[i++] = (digits[do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; @@ -95,7 +99,7 @@ static char *number(char *str, long num, int base, int size, int precision, *str++ = '0'; else if (base == 16) { *str++ = '0'; - *str++ = digits[33]; + *str++ = ('X' | locase); } } if (!(type & LEFT)) @@ -244,9 +248,9 @@ int vsprintf(char *buf, const char *fmt, va_list args) base = 8; break; - case 'X': - flags |= LARGE; case 'x': + flags |= SMALL; + case 'X': base = 16; break; -- cgit v1.1 From cf7700fe24301df2c8d3636cf40784651c098207 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86 PM: move 64-bit hibernation files to arch/x86/power Move arch/x86/kernel/suspend_64.c to arch/x86/power . Move arch/x86/kernel/suspend_asm_64.S to arch/x86/power as hibernate_asm_64.S . Update purpose and copyright information in arch/x86/power/suspend_64.c and arch/x86/power/hibernate_asm_64.S . Update the Makefiles in arch/x86, arch/x86/kernel and arch/x86/power to reflect the above changes. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Makefile | 4 +- arch/x86/kernel/Makefile | 2 - arch/x86/kernel/suspend_64.c | 320 ------------------------------------- arch/x86/kernel/suspend_asm_64.S | 141 ----------------- arch/x86/power/Makefile | 9 +- arch/x86/power/hibernate_asm_64.S | 146 +++++++++++++++++ arch/x86/power/suspend_64.c | 321 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 477 insertions(+), 466 deletions(-) delete mode 100644 arch/x86/kernel/suspend_64.c delete mode 100644 arch/x86/kernel/suspend_asm_64.S create mode 100644 arch/x86/power/hibernate_asm_64.S create mode 100644 arch/x86/power/suspend_64.c (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 364865b..204af43 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -191,8 +191,10 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -ifeq ($(CONFIG_X86_32),y) +# suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ + +ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 21dc1a0..76ec0f8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,8 +84,6 @@ ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_PM) += suspend_64.o - obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c deleted file mode 100644 index 7ac7130..0000000 --- a/arch/x86/kernel/suspend_64.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Suspend support specific for i386. - * - * Distribute under GPLv2 - * - * Copyright (c) 2002 Pavel Machek - * Copyright (c) 2001 Patrick Mochel - */ - -#include -#include -#include -#include -#include -#include - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -static void fix_processor_context(void); - -struct saved_context saved_context; - -/** - * __save_processor_state - save CPU registers before creating a - * hibernation image and before restoring the memory state from it - * @ctxt - structure to store the registers contents in - * - * NOTE: If there is a CPU register the modification of which by the - * boot kernel (ie. the kernel used for loading the hibernation image) - * might affect the operations of the restored target kernel (ie. the one - * saved in the hibernation image), then its contents must be saved by this - * function. In other words, if kernel A is hibernated and different - * kernel B is used for loading the hibernation image into memory, the - * kernel A's __save_processor_state() function must save all registers - * needed by kernel A, so that it can operate correctly after the resume - * regardless of what kernel B does in the meantime. - */ -static void __save_processor_state(struct saved_context *ctxt) -{ - kernel_fpu_begin(); - - /* - * descriptor tables - */ - store_gdt((struct desc_ptr *)&ctxt->gdt_limit); - store_idt((struct desc_ptr *)&ctxt->idt_limit); - store_tr(ctxt->tr); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* - * segment registers - */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - mtrr_save_fixed_ranges(NULL); - - /* - * control registers - */ - rdmsrl(MSR_EFER, ctxt->efer); - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); - ctxt->cr8 = read_cr8(); -} - -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary - */ - kernel_fpu_end(); -} - -/** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from - */ -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); - load_idt((const struct desc_ptr *)&ctxt->idt_limit); - - - /* - * segment registers - */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); -} - -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - /* - * This just modifies memory; should not be necessary. But... This - * is necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - set_tss_desc(cpu, t); - - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } -} - -#ifdef CONFIG_HIBERNATION -/* Defined in arch/x86_64/kernel/suspend_asm.S */ -extern int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3; - -pgd_t *temp_level4_pgt; - -void *relocated_restore_code; - -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; -} - -static int set_up_temporary_mappings(void) -{ - unsigned long start, end, next; - int error; - - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) - return -ENOMEM; - - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - - /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); - } - return 0; -} - -int swsusp_arch_resume(void) -{ - int error; - - /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) - return error; - - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); - - restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -struct restore_data_record { - unsigned long jump_address; - unsigned long cr3; - unsigned long magic; -}; - -#define RESTORE_MAGIC 0x0123456789ABCDEFUL - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = restore_jump_address; - rdr->cr3 = restore_cr3; - rdr->magic = RESTORE_MAGIC; - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; -} -#endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S deleted file mode 100644 index aeb9a4d..0000000 --- a/arch/x86/kernel/suspend_asm_64.S +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright 2004,2005 Pavel Machek , Andi Kleen , Rafael J. Wysocki - * - * Distribute under GPLv2. - * - * swsusp_arch_resume must not use any stack or any nonlocal variables while - * copying pages: - * - * Its rewriting one kernel image with another. What is stack in "old" - * image could very well be data page in "new" image, and overwriting - * your own stack under you is bad idea. - */ - - .text -#include -#include -#include -#include - -ENTRY(swsusp_arch_suspend) - movq $saved_context, %rax - movq %rsp, pt_regs_sp(%rax) - movq %rbp, pt_regs_bp(%rax) - movq %rsi, pt_regs_si(%rax) - movq %rdi, pt_regs_di(%rax) - movq %rbx, pt_regs_bx(%rax) - movq %rcx, pt_regs_cx(%rax) - movq %rdx, pt_regs_dx(%rax) - movq %r8, pt_regs_r8(%rax) - movq %r9, pt_regs_r9(%rax) - movq %r10, pt_regs_r10(%rax) - movq %r11, pt_regs_r11(%rax) - movq %r12, pt_regs_r12(%rax) - movq %r13, pt_regs_r13(%rax) - movq %r14, pt_regs_r14(%rax) - movq %r15, pt_regs_r15(%rax) - pushfq - popq pt_regs_flags(%rax) - - /* save the address of restore_registers */ - movq $restore_registers, %rax - movq %rax, restore_jump_address(%rip) - /* save cr3 */ - movq %cr3, %rax - movq %rax, restore_cr3(%rip) - - call swsusp_save - ret - -ENTRY(restore_image) - /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rdx - movq temp_level4_pgt(%rip), %rax - subq %rdx, %rax - movq %rax, %cr3 - /* Flush TLB */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rax, %cr4; # turn PGE back on - - /* prepare to jump to the image kernel */ - movq restore_jump_address(%rip), %rax - movq restore_cr3(%rip), %rbx - - /* prepare to copy image data to their original locations */ - movq restore_pblist(%rip), %rdx - movq relocated_restore_code(%rip), %rcx - jmpq *%rcx - - /* code below has been relocated to a safe page */ -ENTRY(core_restore_code) -loop: - testq %rdx, %rdx - jz done - - /* get addresses from the pbe and copy the page */ - movq pbe_address(%rdx), %rsi - movq pbe_orig_address(%rdx), %rdi - movq $(PAGE_SIZE >> 3), %rcx - rep - movsq - - /* progress to the next pbe */ - movq pbe_next(%rdx), %rdx - jmp loop -done: - /* jump to the restore_registers address from the image header */ - jmpq *%rax - /* - * NOTE: This assumes that the boot kernel's text mapping covers the - * image kernel's page containing restore_registers and the address of - * this page is the same as in the image kernel's text mapping (it - * should always be true, because the text mapping is linear, starting - * from 0, and is supposed to cover the entire kernel text for every - * kernel). - * - * code below belongs to the image kernel - */ - -ENTRY(restore_registers) - /* go back to the original page tables */ - movq %rbx, %cr3 - - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx; # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - /* We don't restore %rax, it must be 0 anyway */ - movq $saved_context, %rax - movq pt_regs_sp(%rax), %rsp - movq pt_regs_bp(%rax), %rbp - movq pt_regs_si(%rax), %rsi - movq pt_regs_di(%rax), %rdi - movq pt_regs_bx(%rax), %rbx - movq pt_regs_cx(%rax), %rcx - movq pt_regs_dx(%rax), %rdx - movq pt_regs_r8(%rax), %r8 - movq pt_regs_r9(%rax), %r9 - movq pt_regs_r10(%rax), %r10 - movq pt_regs_r11(%rax), %r11 - movq pt_regs_r12(%rax), %r12 - movq pt_regs_r13(%rax), %r13 - movq pt_regs_r14(%rax), %r14 - movq pt_regs_r15(%rax), %r15 - pushq pt_regs_flags(%rax) - popfq - - xorq %rax, %rax - - /* tell the hibernation core that we've just restored the memory */ - movq %rax, in_suspend(%rip) - - ret diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index d764ec9..8ce87fb 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,2 +1,7 @@ -obj-$(CONFIG_PM) += cpu.o -obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_PM) += suspend_64.o + obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o +else + obj-$(CONFIG_PM) += cpu.o + obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +endif diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S new file mode 100644 index 0000000..1deb3244 --- /dev/null +++ b/arch/x86/power/hibernate_asm_64.S @@ -0,0 +1,146 @@ +/* + * Hibernation support for x86-64 + * + * Distribute under GPLv2. + * + * Copyright 2007 Rafael J. Wysocki + * Copyright 2005 Andi Kleen + * Copyright 2004 Pavel Machek + * + * swsusp_arch_resume must not use any stack or any nonlocal variables while + * copying pages: + * + * Its rewriting one kernel image with another. What is stack in "old" + * image could very well be data page in "new" image, and overwriting + * your own stack under you is bad idea. + */ + + .text +#include +#include +#include +#include + +ENTRY(swsusp_arch_suspend) + movq $saved_context, %rax + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) + movq %r8, pt_regs_r8(%rax) + movq %r9, pt_regs_r9(%rax) + movq %r10, pt_regs_r10(%rax) + movq %r11, pt_regs_r11(%rax) + movq %r12, pt_regs_r12(%rax) + movq %r13, pt_regs_r13(%rax) + movq %r14, pt_regs_r14(%rax) + movq %r15, pt_regs_r15(%rax) + pushfq + popq pt_regs_flags(%rax) + + /* save the address of restore_registers */ + movq $restore_registers, %rax + movq %rax, restore_jump_address(%rip) + /* save cr3 */ + movq %cr3, %rax + movq %rax, restore_cr3(%rip) + + call swsusp_save + ret + +ENTRY(restore_image) + /* switch to temporary page tables */ + movq $__PAGE_OFFSET, %rdx + movq temp_level4_pgt(%rip), %rax + subq %rdx, %rax + movq %rax, %cr3 + /* Flush TLB */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(1<<7), %rdx # PGE + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3; + movq %rax, %cr4; # turn PGE back on + + /* prepare to jump to the image kernel */ + movq restore_jump_address(%rip), %rax + movq restore_cr3(%rip), %rbx + + /* prepare to copy image data to their original locations */ + movq restore_pblist(%rip), %rdx + movq relocated_restore_code(%rip), %rcx + jmpq *%rcx + + /* code below has been relocated to a safe page */ +ENTRY(core_restore_code) +loop: + testq %rdx, %rdx + jz done + + /* get addresses from the pbe and copy the page */ + movq pbe_address(%rdx), %rsi + movq pbe_orig_address(%rdx), %rdi + movq $(PAGE_SIZE >> 3), %rcx + rep + movsq + + /* progress to the next pbe */ + movq pbe_next(%rdx), %rdx + jmp loop +done: + /* jump to the restore_registers address from the image header */ + jmpq *%rax + /* + * NOTE: This assumes that the boot kernel's text mapping covers the + * image kernel's page containing restore_registers and the address of + * this page is the same as in the image kernel's text mapping (it + * should always be true, because the text mapping is linear, starting + * from 0, and is supposed to cover the entire kernel text for every + * kernel). + * + * code below belongs to the image kernel + */ + +ENTRY(restore_registers) + /* go back to the original page tables */ + movq %rbx, %cr3 + + /* Flush TLB, including "global" things (vmalloc) */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(1<<7), %rdx; # PGE + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3 + movq %rax, %cr4; # turn PGE back on + + /* We don't restore %rax, it must be 0 anyway */ + movq $saved_context, %rax + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx + movq pt_regs_r8(%rax), %r8 + movq pt_regs_r9(%rax), %r9 + movq pt_regs_r10(%rax), %r10 + movq pt_regs_r11(%rax), %r11 + movq pt_regs_r12(%rax), %r12 + movq pt_regs_r13(%rax), %r13 + movq pt_regs_r14(%rax), %r14 + movq pt_regs_r15(%rax), %r15 + pushq pt_regs_flags(%rax) + popfq + + xorq %rax, %rax + + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + + ret diff --git a/arch/x86/power/suspend_64.c b/arch/x86/power/suspend_64.c new file mode 100644 index 0000000..d51dbf2 --- /dev/null +++ b/arch/x86/power/suspend_64.c @@ -0,0 +1,321 @@ +/* + * Suspend and hibernation support for x86-64 + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include +#include +#include +#include +#include + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +static void fix_processor_context(void); + +struct saved_context saved_context; + +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + store_gdt((struct desc_ptr *)&ctxt->gdt_limit); + store_idt((struct desc_ptr *)&ctxt->idt_limit); + store_tr(ctxt->tr); + + /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ + /* + * segment registers + */ + asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + rdmsrl(MSR_FS_BASE, ctxt->fs_base); + rdmsrl(MSR_GS_BASE, ctxt->gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); + + /* + * control registers + */ + rdmsrl(MSR_EFER, ctxt->efer); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); +} + +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} + +static void do_fpu_end(void) +{ + /* + * Restore FPU regs if necessary + */ + kernel_fpu_end(); +} + +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) +{ + /* + * control registers + */ + wrmsrl(MSR_EFER, ctxt->efer); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); + load_idt((const struct desc_ptr *)&ctxt->idt_limit); + + + /* + * segment registers + */ + asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + load_gs_index(ctxt->gs); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + wrmsrl(MSR_FS_BASE, ctxt->fs_base); + wrmsrl(MSR_GS_BASE, ctxt->gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + + fix_processor_context(); + + do_fpu_end(); + mtrr_ap_init(); +} + +void restore_processor_state(void) +{ + __restore_processor_state(&saved_context); +} + +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + + /* + * This just modifies memory; should not be necessary. But... This + * is necessary, because 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + set_tss_desc(cpu, t); + + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7){ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); + } +} + +#ifdef CONFIG_HIBERNATION +/* Defined in arch/x86_64/kernel/suspend_asm.S */ +extern int restore_image(void); + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + +pgd_t *temp_level4_pgt; + +void *relocated_restore_code; + +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } + return 0; +} + +int swsusp_arch_resume(void) +{ + int error; + + /* We have got enough memory and from now on we cannot recover */ + if ((error = set_up_temporary_mappings())) + return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + + restore_image(); + return 0; +} + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +struct restore_data_record { + unsigned long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; +} +#endif /* CONFIG_HIBERNATION */ -- cgit v1.1 From c57591244a08bb441c83472f5c110151bb7c2cc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86 PM: rename 32-bit files in arch/x86/power Rename cpu.c, suspend.c and swsusp.S in arch/x86/power to cpu_32.c, hibernate_32.c and hibernate_asm_32.S, respectively, and update the purpose and copyright information in these files. Update the Makefile in arch/x86/power to reflect the above changes. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/Makefile | 4 +- arch/x86/power/cpu.c | 133 ----------------------------- arch/x86/power/cpu_32.c | 133 +++++++++++++++++++++++++++++ arch/x86/power/hibernate_32.c | 172 ++++++++++++++++++++++++++++++++++++++ arch/x86/power/hibernate_asm_32.S | 77 +++++++++++++++++ arch/x86/power/suspend.c | 172 -------------------------------------- arch/x86/power/swsusp.S | 78 ----------------- 7 files changed, 384 insertions(+), 385 deletions(-) delete mode 100644 arch/x86/power/cpu.c create mode 100644 arch/x86/power/cpu_32.c create mode 100644 arch/x86/power/hibernate_32.c create mode 100644 arch/x86/power/hibernate_asm_32.S delete mode 100644 arch/x86/power/suspend.c delete mode 100644 arch/x86/power/swsusp.S (limited to 'arch/x86') diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 8ce87fb..2c95118 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -2,6 +2,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PM) += suspend_64.o obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o else - obj-$(CONFIG_PM) += cpu.o - obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o + obj-$(CONFIG_PM) += cpu_32.o + obj-$(CONFIG_HIBERNATION) += hibernate_32.o hibernate_asm_32.o endif diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c deleted file mode 100644 index efcf620..0000000 --- a/arch/x86/power/cpu.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Suspend support specific for i386. - * - * Distribute under GPLv2 - * - * Copyright (c) 2002 Pavel Machek - * Copyright (c) 2001 Patrick Mochel - */ - -#include -#include -#include -#include - -static struct saved_context saved_context; - -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; - -static void __save_processor_state(struct saved_context *ctxt) -{ - mtrr_save_fixed_ranges(NULL); - kernel_fpu_begin(); - - /* - * descriptor tables - */ - store_gdt(&ctxt->gdt); - store_idt(&ctxt->idt); - store_tr(ctxt->tr); - - /* - * segment registers - */ - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); - savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); - - /* - * control registers - */ - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); -} - -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary. - */ - kernel_fpu_end(); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct * t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } - -} - -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - write_cr4(ctxt->cr4); - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - load_gdt(&ctxt->gdt); - load_idt(&ctxt->idt); - - /* - * segment registers - */ - loadsegment(es, ctxt->es); - loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); - - fix_processor_context(); - do_fpu_end(); - mtrr_ap_init(); - mcheck_init(&boot_cpu_data); -} - -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} - -/* Needed by apm.c */ -EXPORT_SYMBOL(save_processor_state); -EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c new file mode 100644 index 0000000..7f9c6da --- /dev/null +++ b/arch/x86/power/cpu_32.c @@ -0,0 +1,133 @@ +/* + * Suspend support specific for i386. + * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include +#include +#include + +static struct saved_context saved_context; + +unsigned long saved_context_ebx; +unsigned long saved_context_esp, saved_context_ebp; +unsigned long saved_context_esi, saved_context_edi; +unsigned long saved_context_eflags; + +static void __save_processor_state(struct saved_context *ctxt) +{ + mtrr_save_fixed_ranges(NULL); + kernel_fpu_begin(); + + /* + * descriptor tables + */ + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); + store_tr(ctxt->tr); + + /* + * segment registers + */ + savesegment(es, ctxt->es); + savesegment(fs, ctxt->fs); + savesegment(gs, ctxt->gs); + savesegment(ss, ctxt->ss); + + /* + * control registers + */ + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); +} + +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} + +static void do_fpu_end(void) +{ + /* + * Restore FPU regs if necessary. + */ + kernel_fpu_end(); +} + +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct * t = &per_cpu(init_tss, cpu); + + set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ + + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); + } + +} + +static void __restore_processor_state(struct saved_context *ctxt) +{ + /* + * control registers + */ + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); + + /* + * segment registers + */ + loadsegment(es, ctxt->es); + loadsegment(fs, ctxt->fs); + loadsegment(gs, ctxt->gs); + loadsegment(ss, ctxt->ss); + + /* + * sysenter MSRs + */ + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); + + fix_processor_context(); + do_fpu_end(); + mtrr_ap_init(); + mcheck_init(&boot_cpu_data); +} + +void restore_processor_state(void) +{ + __restore_processor_state(&saved_context); +} + +/* Needed by apm.c */ +EXPORT_SYMBOL(save_processor_state); +EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c new file mode 100644 index 0000000..5080c37 --- /dev/null +++ b/arch/x86/power/hibernate_32.c @@ -0,0 +1,172 @@ +/* + * Hibernation support specific for i386 - temporary page tables + * + * Distribute under GPLv2 + * + * Copyright (c) 2006 Rafael J. Wysocki + */ + +#include +#include + +#include +#include +#include + +/* Defined in arch/i386/power/swsusp.S */ +extern int restore_image(void); + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* Pointer to the temporary resume page tables */ +pgd_t *resume_pg_dir; + +/* The following three functions are based on the analogous code in + * arch/i386/mm/init.c + */ + +/* + * Create a middle page table on a resume-safe page and put a pointer to it in + * the given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t *resume_one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd_table) + return NULL; + + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + + BUG_ON(pmd_table != pmd_offset(pud, 0)); +#else + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +#endif + + return pmd_table; +} + +/* + * Create a page table on a resume-safe page and place a pointer to it in + * a middle page directory entry. + */ +static pte_t *resume_one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!page_table) + return NULL; + + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + + return page_table; + } + + return pte_offset_kernel(pmd, 0); +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET. The page tables are allocated out of resume-safe pages. + */ +static int resume_physical_mapping_init(pgd_t *pgd_base) +{ + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int pgd_idx, pmd_idx; + + pgd_idx = pgd_index(PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + pfn = 0; + + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = resume_one_md_table_init(pgd); + if (!pmd) + return -ENOMEM; + + if (pfn >= max_low_pfn) + continue; + + for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) { + if (pfn >= max_low_pfn) + break; + + /* Map with big pages if possible, otherwise create + * normal page tables. + * NOTE: We can mark everything as executable here + */ + if (cpu_has_pse) { + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + pfn += PTRS_PER_PTE; + } else { + pte_t *max_pte; + + pte = resume_one_page_table_init(pmd); + if (!pte) + return -ENOMEM; + + max_pte = pte + PTRS_PER_PTE; + for (; pte < max_pte; pte++, pfn++) { + if (pfn >= max_low_pfn) + break; + + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + } + } + } + } + return 0; +} + +static inline void resume_init_first_level_page_table(pgd_t *pg_dir) +{ +#ifdef CONFIG_X86_PAE + int i; + + /* Init entries of the first-level page table to the zero page */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(pg_dir + i, + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#endif +} + +int swsusp_arch_resume(void) +{ + int error; + + resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!resume_pg_dir) + return -ENOMEM; + + resume_init_first_level_page_table(resume_pg_dir); + error = resume_physical_mapping_init(resume_pg_dir); + if (error) + return error; + + /* We have got enough memory and from now on we cannot recover */ + restore_image(); + return 0; +} + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S new file mode 100644 index 0000000..b95aa6c --- /dev/null +++ b/arch/x86/power/hibernate_asm_32.S @@ -0,0 +1,77 @@ +.text + +/* + * This may not use any stack, nor any variable that is not "NoSave": + * + * Its rewriting one kernel image with another. What is stack in "old" + * image could very well be data page in "new" image, and overwriting + * your own stack under you is bad idea. + */ + +#include +#include +#include +#include + + .text + +ENTRY(swsusp_arch_suspend) + + movl %esp, saved_context_esp + movl %ebx, saved_context_ebx + movl %ebp, saved_context_ebp + movl %esi, saved_context_esi + movl %edi, saved_context_edi + pushfl ; popl saved_context_eflags + + call swsusp_save + ret + +ENTRY(restore_image) + movl resume_pg_dir, %ecx + subl $__PAGE_OFFSET, %ecx + movl %ecx, %cr3 + + movl restore_pblist, %edx + .p2align 4,,7 + +copy_loop: + testl %edx, %edx + jz done + + movl pbe_address(%edx), %esi + movl pbe_orig_address(%edx), %edi + + movl $1024, %ecx + rep + movsl + + movl pbe_next(%edx), %edx + jmp copy_loop + .p2align 4,,7 + +done: + /* go back to the original page tables */ + movl $swapper_pg_dir, %ecx + subl $__PAGE_OFFSET, %ecx + movl %ecx, %cr3 + /* Flush TLB, including "global" things (vmalloc) */ + movl mmu_cr4_features, %eax + movl %eax, %edx + andl $~(1<<7), %edx; # PGE + movl %edx, %cr4; # turn off PGE + movl %cr3, %ecx; # flush TLB + movl %ecx, %cr3 + movl %eax, %cr4; # turn PGE back on + + movl saved_context_esp, %esp + movl saved_context_ebp, %ebp + movl saved_context_ebx, %ebx + movl saved_context_esi, %esi + movl saved_context_edi, %edi + + pushl saved_context_eflags ; popfl + + xorl %eax, %eax + + ret diff --git a/arch/x86/power/suspend.c b/arch/x86/power/suspend.c deleted file mode 100644 index a0020b9..0000000 --- a/arch/x86/power/suspend.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Suspend support specific for i386 - temporary page tables - * - * Distribute under GPLv2 - * - * Copyright (c) 2006 Rafael J. Wysocki - */ - -#include -#include - -#include -#include -#include - -/* Defined in arch/i386/power/swsusp.S */ -extern int restore_image(void); - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -/* Pointer to the temporary resume page tables */ -pgd_t *resume_pg_dir; - -/* The following three functions are based on the analogous code in - * arch/i386/mm/init.c - */ - -/* - * Create a middle page table on a resume-safe page and put a pointer to it in - * the given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t *resume_one_md_table_init(pgd_t *pgd) -{ - pud_t *pud; - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd_table) - return NULL; - - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - - BUG_ON(pmd_table != pmd_offset(pud, 0)); -#else - pud = pud_offset(pgd, 0); - pmd_table = pmd_offset(pud, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table on a resume-safe page and place a pointer to it in - * a middle page directory entry. - */ -static pte_t *resume_one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!page_table) - return NULL; - - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - - BUG_ON(page_table != pte_offset_kernel(pmd, 0)); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. The page tables are allocated out of resume-safe pages. - */ -static int resume_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = resume_one_md_table_init(pgd); - if (!pmd) - return -ENOMEM; - - if (pfn >= max_low_pfn) - continue; - - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) { - if (pfn >= max_low_pfn) - break; - - /* Map with big pages if possible, otherwise create - * normal page tables. - * NOTE: We can mark everything as executable here - */ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - pfn += PTRS_PER_PTE; - } else { - pte_t *max_pte; - - pte = resume_one_page_table_init(pmd); - if (!pte) - return -ENOMEM; - - max_pte = pte + PTRS_PER_PTE; - for (; pte < max_pte; pte++, pfn++) { - if (pfn >= max_low_pfn) - break; - - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - } - } - } - } - return 0; -} - -static inline void resume_init_first_level_page_table(pgd_t *pg_dir) -{ -#ifdef CONFIG_X86_PAE - int i; - - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pg_dir + i, - __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); -#endif -} - -int swsusp_arch_resume(void) -{ - int error; - - resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!resume_pg_dir) - return -ENOMEM; - - resume_init_first_level_page_table(resume_pg_dir); - error = resume_physical_mapping_init(resume_pg_dir); - if (error) - return error; - - /* We have got enough memory and from now on we cannot recover */ - restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} diff --git a/arch/x86/power/swsusp.S b/arch/x86/power/swsusp.S deleted file mode 100644 index 53662e0..0000000 --- a/arch/x86/power/swsusp.S +++ /dev/null @@ -1,78 +0,0 @@ -.text - -/* Originally gcc generated, modified by hand - * - * This may not use any stack, nor any variable that is not "NoSave": - * - * Its rewriting one kernel image with another. What is stack in "old" - * image could very well be data page in "new" image, and overwriting - * your own stack under you is bad idea. - */ - -#include -#include -#include -#include - - .text - -ENTRY(swsusp_arch_suspend) - - movl %esp, saved_context_esp - movl %ebx, saved_context_ebx - movl %ebp, saved_context_ebp - movl %esi, saved_context_esi - movl %edi, saved_context_edi - pushfl ; popl saved_context_eflags - - call swsusp_save - ret - -ENTRY(restore_image) - movl resume_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 - - movl restore_pblist, %edx - .p2align 4,,7 - -copy_loop: - testl %edx, %edx - jz done - - movl pbe_address(%edx), %esi - movl pbe_orig_address(%edx), %edi - - movl $1024, %ecx - rep - movsl - - movl pbe_next(%edx), %edx - jmp copy_loop - .p2align 4,,7 - -done: - /* go back to the original page tables */ - movl $swapper_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ - movl mmu_cr4_features, %eax - movl %eax, %edx - andl $~(1<<7), %edx; # PGE - movl %edx, %cr4; # turn off PGE - movl %cr3, %ecx; # flush TLB - movl %ecx, %cr3 - movl %eax, %cr4; # turn PGE back on - - movl saved_context_esp, %esp - movl saved_context_ebp, %ebp - movl saved_context_ebx, %ebx - movl saved_context_esi, %esi - movl saved_context_edi, %edi - - pushl saved_context_eflags ; popfl - - xorl %eax, %eax - - ret -- cgit v1.1 From ef8b03fabfbab0738dacbb6c0c38d5af91759ca1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86 PM: consolidate suspend and hibernation code Move the hibernation-specific code from arch/x86/power/suspend_64.c to a separate file (hibernate_64.c) and the CPU-handling code to cpu_64.c (in line with the corresponding 32-bit code). Simplify arch/x86/power/Makefile . Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/Makefile | 9 +- arch/x86/power/cpu_64.c | 166 ++++++++++++++++++++++ arch/x86/power/hibernate_64.c | 169 ++++++++++++++++++++++ arch/x86/power/suspend_64.c | 321 ------------------------------------------ 4 files changed, 337 insertions(+), 328 deletions(-) create mode 100644 arch/x86/power/cpu_64.c create mode 100644 arch/x86/power/hibernate_64.c delete mode 100644 arch/x86/power/suspend_64.c (limited to 'arch/x86') diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 2c95118..9ff4d5b 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,7 +1,2 @@ -ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_PM) += suspend_64.o - obj-$(CONFIG_HIBERNATION) += hibernate_asm_64.o -else - obj-$(CONFIG_PM) += cpu_32.o - obj-$(CONFIG_HIBERNATION) += hibernate_32.o hibernate_asm_32.o -endif +obj-$(CONFIG_PM_SLEEP) += cpu_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c new file mode 100644 index 0000000..66bdfb5 --- /dev/null +++ b/arch/x86/power/cpu_64.c @@ -0,0 +1,166 @@ +/* + * Suspend and hibernation support for x86-64 + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include +#include +#include +#include +#include + +static void fix_processor_context(void); + +struct saved_context saved_context; + +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + store_gdt((struct desc_ptr *)&ctxt->gdt_limit); + store_idt((struct desc_ptr *)&ctxt->idt_limit); + store_tr(ctxt->tr); + + /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ + /* + * segment registers + */ + asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + rdmsrl(MSR_FS_BASE, ctxt->fs_base); + rdmsrl(MSR_GS_BASE, ctxt->gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); + + /* + * control registers + */ + rdmsrl(MSR_EFER, ctxt->efer); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); +} + +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} + +static void do_fpu_end(void) +{ + /* + * Restore FPU regs if necessary + */ + kernel_fpu_end(); +} + +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) +{ + /* + * control registers + */ + wrmsrl(MSR_EFER, ctxt->efer); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); + load_idt((const struct desc_ptr *)&ctxt->idt_limit); + + + /* + * segment registers + */ + asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + load_gs_index(ctxt->gs); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + wrmsrl(MSR_FS_BASE, ctxt->fs_base); + wrmsrl(MSR_GS_BASE, ctxt->gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + + fix_processor_context(); + + do_fpu_end(); + mtrr_ap_init(); +} + +void restore_processor_state(void) +{ + __restore_processor_state(&saved_context); +} + +static void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + + /* + * This just modifies memory; should not be necessary. But... This + * is necessary, because 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + set_tss_desc(cpu, t); + + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; + + syscall_init(); /* This sets MSR_*STAR and related */ + load_TR_desc(); /* This does ltr */ + load_LDT(¤t->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg7){ + loaddebug(¤t->thread, 0); + loaddebug(¤t->thread, 1); + loaddebug(¤t->thread, 2); + loaddebug(¤t->thread, 3); + /* no 4 and 5 */ + loaddebug(¤t->thread, 6); + loaddebug(¤t->thread, 7); + } +} diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c new file mode 100644 index 0000000..05f28f0 --- /dev/null +++ b/arch/x86/power/hibernate_64.c @@ -0,0 +1,169 @@ +/* + * Hibernation support for x86-64 + * + * Distribute under GPLv2 + * + * Copyright (c) 2007 Rafael J. Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include +#include +#include +#include +#include +#include + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* Defined in arch/x86_64/kernel/suspend_asm.S */ +extern int restore_image(void); + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + +pgd_t *temp_level4_pgt; + +void *relocated_restore_code; + +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + init_level4_pgt[pgd_index(__START_KERNEL_map)]); + + /* Set up the direct mapping from scratch */ + start = (unsigned long)pfn_to_kaddr(0); + end = (unsigned long)pfn_to_kaddr(end_pfn); + + for (; start < end; start = next) { + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + next = start + PGDIR_SIZE; + if (next > end) + next = end; + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; + set_pgd(temp_level4_pgt + pgd_index(start), + mk_kernel_pgd(__pa(pud))); + } + return 0; +} + +int swsusp_arch_resume(void) +{ + int error; + + /* We have got enough memory and from now on we cannot recover */ + if ((error = set_up_temporary_mappings())) + return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + + restore_image(); + return 0; +} + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +struct restore_data_record { + unsigned long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; +} diff --git a/arch/x86/power/suspend_64.c b/arch/x86/power/suspend_64.c deleted file mode 100644 index d51dbf2..0000000 --- a/arch/x86/power/suspend_64.c +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Suspend and hibernation support for x86-64 - * - * Distribute under GPLv2 - * - * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek - * Copyright (c) 2001 Patrick Mochel - */ - -#include -#include -#include -#include -#include -#include - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -static void fix_processor_context(void); - -struct saved_context saved_context; - -/** - * __save_processor_state - save CPU registers before creating a - * hibernation image and before restoring the memory state from it - * @ctxt - structure to store the registers contents in - * - * NOTE: If there is a CPU register the modification of which by the - * boot kernel (ie. the kernel used for loading the hibernation image) - * might affect the operations of the restored target kernel (ie. the one - * saved in the hibernation image), then its contents must be saved by this - * function. In other words, if kernel A is hibernated and different - * kernel B is used for loading the hibernation image into memory, the - * kernel A's __save_processor_state() function must save all registers - * needed by kernel A, so that it can operate correctly after the resume - * regardless of what kernel B does in the meantime. - */ -static void __save_processor_state(struct saved_context *ctxt) -{ - kernel_fpu_begin(); - - /* - * descriptor tables - */ - store_gdt((struct desc_ptr *)&ctxt->gdt_limit); - store_idt((struct desc_ptr *)&ctxt->idt_limit); - store_tr(ctxt->tr); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* - * segment registers - */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - mtrr_save_fixed_ranges(NULL); - - /* - * control registers - */ - rdmsrl(MSR_EFER, ctxt->efer); - ctxt->cr0 = read_cr0(); - ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); - ctxt->cr8 = read_cr8(); -} - -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} - -static void do_fpu_end(void) -{ - /* - * Restore FPU regs if necessary - */ - kernel_fpu_end(); -} - -/** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from - */ -static void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); - write_cr3(ctxt->cr3); - write_cr2(ctxt->cr2); - write_cr0(ctxt->cr0); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); - load_idt((const struct desc_ptr *)&ctxt->idt_limit); - - - /* - * segment registers - */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); -} - -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} - -static void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - /* - * This just modifies memory; should not be necessary. But... This - * is necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. - */ - set_tss_desc(cpu, t); - - get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } -} - -#ifdef CONFIG_HIBERNATION -/* Defined in arch/x86_64/kernel/suspend_asm.S */ -extern int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3; - -pgd_t *temp_level4_pgt; - -void *relocated_restore_code; - -static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j; - - i = pud_index(address); - pud = pud + i; - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long paddr; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) - break; - - pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) - break; - pe = __PAGE_KERNEL_LARGE_EXEC | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } - } - return 0; -} - -static int set_up_temporary_mappings(void) -{ - unsigned long start, end, next; - int error; - - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) - return -ENOMEM; - - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - - /* Set up the direct mapping from scratch */ - start = (unsigned long)pfn_to_kaddr(0); - end = (unsigned long)pfn_to_kaddr(end_pfn); - - for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!pud) - return -ENOMEM; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) - return error; - set_pgd(temp_level4_pgt + pgd_index(start), - mk_kernel_pgd(__pa(pud))); - } - return 0; -} - -int swsusp_arch_resume(void) -{ - int error; - - /* We have got enough memory and from now on we cannot recover */ - if ((error = set_up_temporary_mappings())) - return error; - - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - memcpy(relocated_restore_code, &core_restore_code, - &restore_registers - &core_restore_code); - - restore_image(); - return 0; -} - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -struct restore_data_record { - unsigned long jump_address; - unsigned long cr3; - unsigned long magic; -}; - -#define RESTORE_MAGIC 0x0123456789ABCDEFUL - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = restore_jump_address; - rdr->cr3 = restore_cr3; - rdr->magic = RESTORE_MAGIC; - return 0; -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; -} -#endif /* CONFIG_HIBERNATION */ -- cgit v1.1 From 261f0ce5ccdd17dc240d8453ca5ffc4688b92700 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86 PM: update stale comments In some suspend and hibernation files in arch/x86/power there are comments referring to arch/x86-64 and arch/i386 . Update them to reflect the current code layout. Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/power/hibernate_32.c | 4 ++-- arch/x86/power/hibernate_64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 5080c37..f2b6e3f 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -13,7 +13,7 @@ #include #include -/* Defined in arch/i386/power/swsusp.S */ +/* Defined in hibernate_asm_32.S */ extern int restore_image(void); /* References to section boundaries */ @@ -23,7 +23,7 @@ extern const void __nosave_begin, __nosave_end; pgd_t *resume_pg_dir; /* The following three functions are based on the analogous code in - * arch/i386/mm/init.c + * arch/x86/mm/init_32.c */ /* diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 05f28f0..b542355 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -18,7 +18,7 @@ /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -/* Defined in arch/x86_64/kernel/suspend_asm.S */ +/* Defined in hibernate_asm_64.S */ extern int restore_image(void); /* -- cgit v1.1 From 76ebd0548df6ee48586e9b80d8fc2f58aa5fb51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: introduce page pool in cpa DEBUG_PAGEALLOC was not possible on 64-bit due to its early-bootup hardcoded reliance on PSE pages, and the unrobustness of the runtime splitup of large pages. The splitup ended in recursive calls to alloc_pages() when a page for a pte split was requested. Avoid the recursion with a preallocated page pool, which is used to split up large mappings and gets refilled in the return path of kernel_map_pages after the split has been done. The size of the page pool is adjusted to the available memory. This part just implements the page pool and the initialization w/o using it yet. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ arch/x86/mm/pageattr.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 54aba3c..8106bba 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -664,6 +664,8 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + cpa_init(); + /* * Subtle. SMP is doing it's boot stuff late (because it has to * fork idle threads) - but it also needs low mappings for the diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 620d2b6..b59fc23 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -528,6 +528,8 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); + + cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index eb2a544..831462c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -336,6 +337,77 @@ out_unlock: return do_split; } +static LIST_HEAD(page_pool); +static unsigned long pool_size, pool_pages, pool_low; +static unsigned long pool_used, pool_failed, pool_refill; + +static void cpa_fill_pool(void) +{ + struct page *p; + gfp_t gfp = GFP_KERNEL; + + /* Do not allocate from interrupt context */ + if (in_irq() || irqs_disabled()) + return; + /* + * Check unlocked. I does not matter when we have one more + * page in the pool. The bit lock avoids recursive pool + * allocations: + */ + if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + return; + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * We could do: + * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; + * but this fails on !PREEMPT kernels + */ + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; +#endif + + while (pool_pages < pool_size) { + p = alloc_pages(gfp, 0); + if (!p) { + pool_failed++; + break; + } + spin_lock_irq(&pgd_lock); + list_add(&p->lru, &page_pool); + pool_pages++; + spin_unlock_irq(&pgd_lock); + } + clear_bit_unlock(0, &pool_refill); +} + +#define SHIFT_MB (20 - PAGE_SHIFT) +#define ROUND_MB_GB ((1 << 10) - 1) +#define SHIFT_MB_GB 10 +#define POOL_PAGES_PER_GB 16 + +void __init cpa_init(void) +{ + struct sysinfo si; + unsigned long gb; + + si_meminfo(&si); + /* + * Calculate the number of pool pages: + * + * Convert totalram (nr of pages) to MiB and round to the next + * GiB. Shift MiB to Gib and multiply the result by + * POOL_PAGES_PER_GB: + */ + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + pool_low = pool_size; + + cpa_fill_pool(); + printk(KERN_DEBUG + "CPA: page pool initialized %lu of %lu pages preallocated\n", + pool_pages, pool_size); +} + static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; @@ -600,7 +672,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * Check whether we really changed something: */ if (!cpa.flushtlb) - return ret; + goto out; /* * No need to flush, when we did not set any of the caching @@ -619,6 +691,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, else cpa_flush_all(cache); +out: + cpa_fill_pool(); return ret; } @@ -772,6 +846,12 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + /* + * Try to refill the page pool here. We can do this only after + * the tlb flush. + */ + cpa_fill_pool(); } #endif -- cgit v1.1 From eb5b5f024c40f02e9b0f3801173769a726f170fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: cpa, use page pool Switch the split page code to use the page pool. We do this unconditionally to avoid different behaviour with and without DEBUG_PAGEALLOC enabled. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 831462c..e5d29a1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -411,20 +411,29 @@ void __init cpa_init(void) static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; - gfp_t gfp_flags = GFP_KERNEL; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; struct page *base; -#ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = GFP_ATOMIC | __GFP_NOWARN; -#endif - base = alloc_pages(gfp_flags, 0); - if (!base) + /* + * Get a page from the pool. The pool list is protected by the + * pgd_lock, which we have to take anyway for the split + * operation: + */ + spin_lock_irqsave(&pgd_lock, flags); + if (list_empty(&page_pool)) { + spin_unlock_irqrestore(&pgd_lock, flags); return -ENOMEM; + } + + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; - spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page * up for us already: @@ -469,11 +478,17 @@ static int split_large_page(pte_t *kpte, unsigned long address) base = NULL; out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) { + list_add(&base->lru, &page_pool); + pool_pages++; + } else + pool_used++; spin_unlock_irqrestore(&pgd_lock, flags); - if (base) - __free_pages(base, 0); - return 0; } -- cgit v1.1 From b1d95f4e41d6a5969e3a847ceeae8379f30c84c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: cpa, enable CONFIG_DEBUG_PAGEALLOC on 64-bit Now, that the page pool is in place we can enable DEBUG_PAGEALLOC on 64bit. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fa55514..864affc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -34,13 +34,9 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. -comment "Page alloc debug is incompatible with Software Suspend on i386" - depends on DEBUG_KERNEL && HIBERNATION - depends on X86_32 - config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && X86_32 + depends on DEBUG_KERNEL help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types -- cgit v1.1 From fac84939609a683503947f41eb93e1917d026263 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 9 Feb 2008 23:24:09 +0100 Subject: x86: cpa, strict range check in try_preserve_large_page() Right now, we check only the first 4k page for static required protections. This does not take overlapping regions into account. So we might end up setting the wrong permissions/protections for other parts of this large page. This can be optimized further, but correctness is the important part. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e5d29a1..440210a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -253,10 +253,10 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags; + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int do_split = 1; + int i, do_split = 1; unsigned int level; spin_lock_irqsave(&pgd_lock, flags); @@ -304,6 +304,19 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, new_prot = static_protections(new_prot, address); /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + addr = address + PAGE_SIZE; + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) { + pgprot_t chk_prot = static_protections(new_prot, addr); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + + /* * If there are no changes, return. maxpages has been updated * above: */ -- cgit v1.1