From 32346f47dd46bed291464e194a6c47da6fdd1bc3 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:38 +0200 Subject: [S390] Update default configuration. Signed-off-by: Martin Schwidefsky --- arch/s390/defconfig | 131 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 2aae23d..ece7b99 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22 -# Tue Jul 17 12:50:23 2007 +# Linux kernel version: 2.6.23 +# Mon Oct 22 12:10:44 2007 # CONFIG_MMU=y CONFIG_ZONE_DMA=y @@ -19,15 +19,11 @@ CONFIG_S390=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # -# Code maturity level options +# General setup # CONFIG_EXPERIMENTAL=y CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 - -# -# General setup -# CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y @@ -42,7 +38,14 @@ CONFIG_AUDIT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=17 +CONFIG_CGROUPS=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_CGROUP_NS=y +CONFIG_CGROUP_CPUACCT=y # CONFIG_CPUSETS is not set +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_FAIR_USER_SCHED=y +# CONFIG_FAIR_CGROUP_SCHED is not set CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_BLK_DEV_INITRD=y @@ -63,7 +66,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -83,6 +85,7 @@ CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_BLK_DEV_IO_TRACE is not set CONFIG_BLK_DEV_BSG=y +CONFIG_BLOCK_COMPAT=y # # IO Schedulers @@ -108,7 +111,6 @@ CONFIG_64BIT=y CONFIG_SMP=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y -CONFIG_DEFAULT_MIGRATION_COST=1000000 CONFIG_COMPAT=y CONFIG_SYSVIPC_COMPAT=y CONFIG_AUDIT_ARCH=y @@ -143,9 +145,11 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y 
CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_HOLES_IN_ZONE=y @@ -219,12 +223,14 @@ CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_INET_XFRM_MODE_BEET=y +CONFIG_INET_LRO=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set CONFIG_TCP_CONG_CUBIC=y CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_TCP_MD5SIG is not set +# CONFIG_IP_VS is not set CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set @@ -243,7 +249,48 @@ CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set -# CONFIG_NETFILTER is not set +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NF_CONNTRACK_ENABLED=m +CONFIG_NF_CONNTRACK=m +# CONFIG_NF_CT_ACCT is not set +# CONFIG_NF_CONNTRACK_MARK is not set +# CONFIG_NF_CONNTRACK_EVENTS is not set +# CONFIG_NF_CT_PROTO_SCTP is not set +# CONFIG_NF_CT_PROTO_UDPLITE is not set +# CONFIG_NF_CONNTRACK_AMANDA is not set +# CONFIG_NF_CONNTRACK_FTP is not set +# CONFIG_NF_CONNTRACK_H323 is not set +# CONFIG_NF_CONNTRACK_IRC is not set +# CONFIG_NF_CONNTRACK_NETBIOS_NS is not set +# CONFIG_NF_CONNTRACK_PPTP is not set +# CONFIG_NF_CONNTRACK_SANE is not set +# CONFIG_NF_CONNTRACK_SIP is not set +# CONFIG_NF_CONNTRACK_TFTP is not set +# CONFIG_NF_CT_NETLINK is not set +# CONFIG_NETFILTER_XTABLES is not set + +# +# IP: Netfilter Configuration +# +# CONFIG_NF_CONNTRACK_IPV4 is not set +# CONFIG_IP_NF_QUEUE is not set +# CONFIG_IP_NF_IPTABLES is not set +# CONFIG_IP_NF_ARPTABLES is not set + +# +# IPv6: Netfilter Configuration (EXPERIMENTAL) +# +# CONFIG_NF_CONNTRACK_IPV6 is not 
set +# CONFIG_IP6_NF_QUEUE is not set +# CONFIG_IP6_NF_IPTABLES is not set # CONFIG_IP_DCCP is not set CONFIG_IP_SCTP=m # CONFIG_SCTP_DBG_MSG is not set @@ -263,12 +310,7 @@ CONFIG_SCTP_HMAC_MD5=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# CONFIG_NET_SCHED=y -CONFIG_NET_SCH_FIFO=y # # Queueing/Scheduling @@ -306,10 +348,12 @@ CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=y # CONFIG_NET_ACT_GACT is not set # CONFIG_NET_ACT_MIRRED is not set +CONFIG_NET_ACT_NAT=m # CONFIG_NET_ACT_PEDIT is not set # CONFIG_NET_ACT_SIMP is not set CONFIG_NET_CLS_POLICE=y # CONFIG_NET_CLS_IND is not set +CONFIG_NET_SCH_FIFO=y # # Network testing @@ -329,6 +373,7 @@ CONFIG_CCW=y # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y # CONFIG_FW_LOADER is not set @@ -400,17 +445,11 @@ CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set - -# -# SCSI low-level drivers -# +# CONFIG_SCSI_SRP_ATTRS is not set +CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_SCSI_DEBUG is not set CONFIG_ZFCP=y - -# -# Multi-device support (RAID and LVM) -# CONFIG_MD=y CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m @@ -429,7 +468,9 @@ CONFIG_DM_ZERO=y CONFIG_DM_MULTIPATH=y # CONFIG_DM_MULTIPATH_EMC is not set # CONFIG_DM_MULTIPATH_RDAC is not set +# CONFIG_DM_MULTIPATH_HP is not set # CONFIG_DM_DELAY is not set +# CONFIG_DM_UEVENT is not set CONFIG_NETDEVICES=y # CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_IFB is not set @@ -438,8 +479,13 @@ CONFIG_BONDING=m # CONFIG_MACVLAN is not set CONFIG_EQUALIZER=m CONFIG_TUN=m +CONFIG_VETH=m CONFIG_NET_ETHERNET=y # CONFIG_MII is not set +# CONFIG_IBM_NEW_EMAC_ZMII is not set +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set CONFIG_NETDEV_1000=y CONFIG_NETDEV_10000=y # CONFIG_TR 
is not set @@ -473,7 +519,6 @@ CONFIG_CCWGROUP=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 -# CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=m # CONFIG_R3964 is not set CONFIG_RAW_DRIVER=m @@ -490,7 +535,6 @@ CONFIG_TN3270_CONSOLE=y CONFIG_TN3215=y CONFIG_TN3215_CONSOLE=y CONFIG_CCW_CONSOLE=y -CONFIG_SCLP=y CONFIG_SCLP_TTY=y CONFIG_SCLP_CONSOLE=y CONFIG_SCLP_VT220_TTY=y @@ -514,6 +558,11 @@ CONFIG_S390_TAPE_34XX=m CONFIG_MONWRITER=m CONFIG_S390_VMUR=m # CONFIG_POWER_SUPPLY is not set +# CONFIG_WATCHDOG is not set + +# +# Sonics Silicon Backplane +# # # File systems @@ -569,7 +618,6 @@ CONFIG_SYSFS=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y # CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y CONFIG_CONFIGFS_FS=m # @@ -588,10 +636,7 @@ CONFIG_CONFIGFS_FS=m # CONFIG_QNX4FS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# +CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y # CONFIG_NFS_V3_ACL is not set @@ -638,27 +683,13 @@ CONFIG_MSDOS_PARTITION=y # CONFIG_KARMA_PARTITION is not set # CONFIG_EFI_PARTITION is not set # CONFIG_SYSV68_PARTITION is not set - -# -# Native Language Support -# # CONFIG_NLS is not set - -# -# Distributed Lock Manager -# CONFIG_DLM=m # CONFIG_DLM_DEBUG is not set - -# -# Instrumentation Support -# - -# -# Profiling support -# +CONFIG_INSTRUMENTATION=y # CONFIG_PROFILING is not set CONFIG_KPROBES=y +# CONFIG_MARKERS is not set # # Kernel hacking @@ -682,6 +713,7 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -694,14 +726,17 @@ CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set +CONFIG_SAMPLES=y # # Security options # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set +# 
CONFIG_SECURITY_FILE_CAPABILITIES is not set CONFIG_CRYPTO=y CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_AEAD=m CONFIG_CRYPTO_BLKCIPHER=y CONFIG_CRYPTO_HASH=m CONFIG_CRYPTO_MANAGER=y @@ -720,6 +755,7 @@ CONFIG_CRYPTO_ECB=m CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_PCBC=m # CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_XTS is not set # CONFIG_CRYPTO_CRYPTD is not set # CONFIG_CRYPTO_DES is not set CONFIG_CRYPTO_FCRYPT=m @@ -733,11 +769,13 @@ CONFIG_CRYPTO_FCRYPT=m # CONFIG_CRYPTO_ARC4 is not set # CONFIG_CRYPTO_KHAZAD is not set # CONFIG_CRYPTO_ANUBIS is not set +CONFIG_CRYPTO_SEED=m # CONFIG_CRYPTO_DEFLATE is not set # CONFIG_CRYPTO_MICHAEL_MIC is not set # CONFIG_CRYPTO_CRC32C is not set CONFIG_CRYPTO_CAMELLIA=m # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_SHA1_S390 is not set # CONFIG_CRYPTO_SHA256_S390 is not set @@ -755,5 +793,6 @@ CONFIG_BITREVERSE=m # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=m +CONFIG_CRC7=m # CONFIG_LIBCRC32C is not set CONFIG_PLIST=y -- cgit v1.1 From fae8b22d3e3e3a3d317a7746493997af02a3f35c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Oct 2007 12:52:39 +0200 Subject: [S390] Add per-cpu idle time / idle count sysfs attributes. Add two new sysfs entries per cpu: idle_count and idle_time. idle_count contains the number of times a cpu went into idle state. idle_time contains the time a cpu spent in idle state in microseconds. This can be used e.g. by powertop to tell how often idle state is entered and left. 
# cat /sys/devices/system/cpu/cpu0/idle_count 504 # cat /sys/devices/system/cpu/cpu0/idle_time 469734037 us Cc: Arjan van de Ven Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/process.c | 16 ++++++++++++ arch/s390/kernel/smp.c | 63 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 70c5737..cc7c4ba 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -44,6 +44,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); @@ -91,6 +92,14 @@ EXPORT_SYMBOL(unregister_idle_notifier); void do_monitor_call(struct pt_regs *regs, long interruption_code) { + struct s390_idle_data *idle; + + idle = &__get_cpu_var(s390_idle); + spin_lock(&idle->lock); + idle->idle_time += get_clock() - idle->idle_enter; + idle->in_idle = 0; + spin_unlock(&idle->lock); + /* disable monitor call class 0 */ __ctl_clear_bit(8, 15); @@ -105,6 +114,7 @@ extern void s390_handle_mcck(void); static void default_idle(void) { int cpu, rc; + struct s390_idle_data *idle; /* CPU is going idle. */ cpu = smp_processor_id(); @@ -142,6 +152,12 @@ static void default_idle(void) return; } + idle = &__get_cpu_var(s390_idle); + spin_lock(&idle->lock); + idle->idle_count++; + idle->in_idle = 1; + idle->idle_enter = get_clock(); + spin_unlock(&idle->lock); trace_hardirqs_on(); /* Wait for external, I/O or machine check interrupt. */ __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 35edbef..ba3fff0 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -42,6 +42,7 @@ #include #include #include +#include /* * An array with a pointer the lowcore of every CPU. 
@@ -494,6 +495,8 @@ int __cpuinit start_secondary(void *cpuvoid) return 0; } +DEFINE_PER_CPU(struct s390_idle_data, s390_idle); + static void __init smp_create_idle(unsigned int cpu) { struct task_struct *p; @@ -506,6 +509,7 @@ static void __init smp_create_idle(unsigned int cpu) if (IS_ERR(p)) panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p)); current_set[cpu] = p; + spin_lock_init(&(&per_cpu(s390_idle, cpu))->lock); } static int cpu_stopped(int cpu) @@ -724,6 +728,7 @@ void __init smp_prepare_boot_cpu(void) cpu_set(0, cpu_online_map); S390_lowcore.percpu_offset = __per_cpu_offset[0]; current_set[0] = current; + spin_lock_init(&(&__get_cpu_var(s390_idle))->lock); } void __init smp_cpus_done(unsigned int max_cpus) @@ -756,22 +761,71 @@ static ssize_t show_capability(struct sys_device *dev, char *buf) } static SYSDEV_ATTR(capability, 0444, show_capability, NULL); +static ssize_t show_idle_count(struct sys_device *dev, char *buf) +{ + struct s390_idle_data *idle; + unsigned long long idle_count; + + idle = &per_cpu(s390_idle, dev->id); + spin_lock_irq(&idle->lock); + idle_count = idle->idle_count; + spin_unlock_irq(&idle->lock); + return sprintf(buf, "%llu\n", idle_count); +} +static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL); + +static ssize_t show_idle_time(struct sys_device *dev, char *buf) +{ + struct s390_idle_data *idle; + unsigned long long new_time; + + idle = &per_cpu(s390_idle, dev->id); + spin_lock_irq(&idle->lock); + if (idle->in_idle) { + new_time = get_clock(); + idle->idle_time += new_time - idle->idle_enter; + idle->idle_enter = new_time; + } + new_time = idle->idle_time; + spin_unlock_irq(&idle->lock); + return sprintf(buf, "%llu us\n", new_time >> 12); +} +static SYSDEV_ATTR(idle_time, 0444, show_idle_time, NULL); + +static struct attribute *cpu_attrs[] = { + &attr_capability.attr, + &attr_idle_count.attr, + &attr_idle_time.attr, + NULL, +}; + +static struct attribute_group cpu_attr_group = { + .attrs = cpu_attrs, +}; + static int 
__cpuinit smp_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; struct cpu *c = &per_cpu(cpu_devices, cpu); struct sys_device *s = &c->sysdev; + struct s390_idle_data *idle; switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (sysdev_create_file(s, &attr_capability)) + idle = &per_cpu(s390_idle, cpu); + spin_lock_irq(&idle->lock); + idle->idle_enter = 0; + idle->idle_time = 0; + idle->idle_count = 0; + spin_unlock_irq(&idle->lock); + if (sysfs_create_group(&s->kobj, &cpu_attr_group)) return NOTIFY_BAD; break; case CPU_DEAD: case CPU_DEAD_FROZEN: - sysdev_remove_file(s, &attr_capability); + sysfs_remove_group(&s->kobj, &cpu_attr_group); break; } return NOTIFY_OK; @@ -784,6 +838,7 @@ static struct notifier_block __cpuinitdata smp_cpu_nb = { static int __init topology_init(void) { int cpu; + int rc; register_cpu_notifier(&smp_cpu_nb); @@ -796,7 +851,9 @@ static int __init topology_init(void) if (!cpu_online(cpu)) continue; s = &c->sysdev; - sysdev_create_file(s, &attr_capability); + rc = sysfs_create_group(&s->kobj, &cpu_attr_group); + if (rc) + return rc; } return 0; } -- cgit v1.1 From e3d3683d1402c1737687cb698451d545f57c32a7 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Mon, 22 Oct 2007 12:52:43 +0200 Subject: [S390] kernel: Fix dump on panic for DASDs under LPAR. Currently the ccw method is used to ipl the DASD dump record under LPAR. This mechanism is not reliable, which can cause dump failures. This fix now uses the diag 308 ipl method for all machines, which have diag308 subcode 5 and 4 support. 
Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ipl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 66b5190..ce0856d 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -648,6 +648,8 @@ static int dump_set_type(enum dump_type type) case DUMP_TYPE_CCW: if (MACHINE_IS_VM) dump_method = DUMP_METHOD_CCW_VM; + else if (diag308_set_works) + dump_method = DUMP_METHOD_CCW_DIAG; else dump_method = DUMP_METHOD_CCW_CIO; break; -- cgit v1.1 From ba8a9229ab9e80278c28ad68b15053f65b2b0a7c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:44 +0200 Subject: [S390] tlb flush fix. The current tlb flushing code for page table entries violates the s390 architecture in a small detail. The relevant section from the principles of operation (SA22-7832-02 page 3-47): "A valid table entry must not be changed while it is attached to any CPU and may be used for translation by that CPU except to (1) invalidate the entry by using INVALIDATE PAGE TABLE ENTRY or INVALIDATE DAT TABLE ENTRY, (2) alter bits 56-63 of a page-table entry, or (3) make a change by means of a COMPARE AND SWAP AND PURGE instruction that purges the TLB." That means if one thread of a multithreaded application uses a vma while another thread does an unmap on it, the page table entries of that vma need to be removed with IPTE, IDTE or CSP. In some strange and rare situations a cpu could check-stop (die) because an entry has been pushed out of the TLB that is still needed to complete a (milli-coded) instruction. I've never seen it happen with the current code on any of the supported machines, so right now this is a theoretical problem. But I want to fix it nevertheless, to avoid headaches in the future.
To get this implemented correctly without changing common code the primitives ptep_get_and_clear, ptep_get_and_clear_full and ptep_set_wrprotect need to use the IPTE instruction to invalidate the pte before the new pte value gets stored. If IPTE is always used for the three primitives, three important operations will have a performance hit: fork, mprotect and exit_mmap. Time for some workarounds: * 1: ptep_get_and_clear_full is used in unmap_vmas to remove page table entries in a batched tlb gather operation. If the mmu_gather context passed to unmap_vmas has been started with full_mm_flush==1 or if only one cpu is online or if the only user of a mm_struct is the current process then the fullmm indication in the mmu_gather context is set to one. All TLBs for mm_struct are flushed by the tlb_gather_mmu call. No new TLBs can be created while the unmap is in progress. In this case ptep_get_and_clear_full clears the ptes with a simple store. * 2: ptep_get_and_clear is used in change_protection to clear the ptes from the page tables before they are reentered with the new access flags. At the end of the update flush_tlb_range clears the remaining TLBs. In general the ptep_get_and_clear has to issue IPTE for each pte and flush_tlb_range is a nop. But if there is only one user of the mm_struct then ptep_get_and_clear uses simple stores to do the update and flush_tlb_range will flush the TLBs. * 3: Similar to 2, ptep_set_wrprotect is used in copy_page_range for a fork to make all ptes of a cow mapping read-only. At the end of copy_page_range dup_mmap will flush the TLBs with a call to flush_tlb_mm. Check for mm->mm_users and if there is only one user avoid using IPTE in ptep_set_wrprotect and let flush_tlb_mm clear the TLBs. Overall for single-threaded programs the tlb flush code now performs better, for multi-threaded programs it is slightly worse.
In particular exit_mmap() now does a single IDTE for the mm and then just frees every page cache reference and every page table page directly without a delay over the mmu_gather structure. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index ba3fff0..1d97fe1 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -326,7 +326,7 @@ static void smp_ext_bitcall(int cpu, ec_bit_sig sig) */ void smp_ptlb_callback(void *info) { - local_flush_tlb(); + __tlb_flush_local(); } void smp_ptlb_all(void) -- cgit v1.1 From 6f3fa3f0eb8fe4675f8543dd4be3365577e1d487 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:45 +0200 Subject: [S390] Remove unused user_seg from thread structure. Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/process.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index cc7c4ba..96492cf 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -270,14 +270,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, save_fp_regs(¤t->thread.fp_regs); memcpy(&p->thread.fp_regs, ¤t->thread.fp_regs, sizeof(s390_fp_regs)); - p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _SEGMENT_TABLE; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) p->thread.acrs[0] = regs->gprs[6]; #else /* CONFIG_64BIT */ /* Save the fpu registers to new thread structure. */ save_fp_regs(&p->thread.fp_regs); - p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE; /* Set a new TLS ? 
*/ if (clone_flags & CLONE_SETTLS) { if (test_thread_flag(TIF_31BIT)) { -- cgit v1.1 From e4aa402e7a3b6b87d8df6243a37171cdcd2f01c2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:46 +0200 Subject: [S390] Introduce follow_table in uaccess_pt.c Define and use follow_table inline in uaccess_pt.c to simplify the code. Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess_pt.c | 85 ++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index b159a9d..dc37ea8 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -15,6 +15,22 @@ #include #include "uaccess.h" +static inline pte_t *follow_table(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return NULL; + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return NULL; + + return pte_offset_map(pmd, addr); +} + static int __handle_fault(struct mm_struct *mm, unsigned long address, int write_access) { @@ -85,8 +101,6 @@ static size_t __user_copy_pt(unsigned long uaddr, void *kptr, { struct mm_struct *mm = current->mm; unsigned long offset, pfn, done, size; - pgd_t *pgd; - pmd_t *pmd; pte_t *pte; void *from, *to; @@ -94,15 +108,7 @@ static size_t __user_copy_pt(unsigned long uaddr, void *kptr, retry: spin_lock(&mm->page_table_lock); do { - pgd = pgd_offset(mm, uaddr); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto fault; - - pmd = pmd_offset(pgd, uaddr); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto fault; - - pte = pte_offset_map(pmd, uaddr); + pte = follow_table(mm, uaddr); if (!pte || !pte_present(*pte) || (write_user && !pte_write(*pte))) goto fault; @@ -142,22 +148,12 @@ static unsigned long __dat_user_addr(unsigned long uaddr) { struct mm_struct *mm = current->mm; unsigned long 
pfn, ret; - pgd_t *pgd; - pmd_t *pmd; pte_t *pte; int rc; ret = 0; retry: - pgd = pgd_offset(mm, uaddr); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto fault; - - pmd = pmd_offset(pgd, uaddr); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto fault; - - pte = pte_offset_map(pmd, uaddr); + pte = follow_table(mm, uaddr); if (!pte || !pte_present(*pte)) goto fault; @@ -229,8 +225,6 @@ static size_t strnlen_user_pt(size_t count, const char __user *src) unsigned long uaddr = (unsigned long) src; struct mm_struct *mm = current->mm; unsigned long offset, pfn, done, len; - pgd_t *pgd; - pmd_t *pmd; pte_t *pte; size_t len_str; @@ -240,15 +234,7 @@ static size_t strnlen_user_pt(size_t count, const char __user *src) retry: spin_lock(&mm->page_table_lock); do { - pgd = pgd_offset(mm, uaddr); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto fault; - - pmd = pmd_offset(pgd, uaddr); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto fault; - - pte = pte_offset_map(pmd, uaddr); + pte = follow_table(mm, uaddr); if (!pte || !pte_present(*pte)) goto fault; @@ -308,8 +294,6 @@ static size_t copy_in_user_pt(size_t n, void __user *to, uaddr, done, size; unsigned long uaddr_from = (unsigned long) from; unsigned long uaddr_to = (unsigned long) to; - pgd_t *pgd_from, *pgd_to; - pmd_t *pmd_from, *pmd_to; pte_t *pte_from, *pte_to; int write_user; @@ -317,39 +301,14 @@ static size_t copy_in_user_pt(size_t n, void __user *to, retry: spin_lock(&mm->page_table_lock); do { - pgd_from = pgd_offset(mm, uaddr_from); - if (pgd_none(*pgd_from) || unlikely(pgd_bad(*pgd_from))) { - uaddr = uaddr_from; - write_user = 0; - goto fault; - } - pgd_to = pgd_offset(mm, uaddr_to); - if (pgd_none(*pgd_to) || unlikely(pgd_bad(*pgd_to))) { - uaddr = uaddr_to; - write_user = 1; - goto fault; - } - - pmd_from = pmd_offset(pgd_from, uaddr_from); - if (pmd_none(*pmd_from) || unlikely(pmd_bad(*pmd_from))) { - uaddr = uaddr_from; - write_user = 0; - goto fault; - } - pmd_to = 
pmd_offset(pgd_to, uaddr_to); - if (pmd_none(*pmd_to) || unlikely(pmd_bad(*pmd_to))) { - uaddr = uaddr_to; - write_user = 1; - goto fault; - } - - pte_from = pte_offset_map(pmd_from, uaddr_from); + pte_from = follow_table(mm, uaddr_from); if (!pte_from || !pte_present(*pte_from)) { uaddr = uaddr_from; write_user = 0; goto fault; } - pte_to = pte_offset_map(pmd_to, uaddr_to); + + pte_to = follow_table(mm, uaddr_to); if (!pte_to || !pte_present(*pte_to) || !pte_write(*pte_to)) { uaddr = uaddr_to; write_user = 1; -- cgit v1.1 From 3610cce87af0693603db171d5b6f6735f5e3dc5b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:47 +0200 Subject: [S390] Cleanup page table definitions. - De-confuse the defines for the address-space-control-elements and the segment/region table entries. - Create out of line functions for page table allocation / freeing. - Simplify get_shadow_xxx functions. Signed-off-by: Martin Schwidefsky --- arch/s390/mm/Makefile | 2 +- arch/s390/mm/init.c | 28 +++++++-------- arch/s390/mm/pgtable.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/s390/mm/vmem.c | 19 ++++------ 4 files changed, 114 insertions(+), 29 deletions(-) create mode 100644 arch/s390/mm/pgtable.c (limited to 'arch') diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f95449b..6640193 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -2,6 +2,6 @@ # Makefile for the linux s390-specific parts of the memory manager. 
# -obj-y := init.o fault.o extmem.o mmap.o vmem.o +obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o obj-$(CONFIG_CMM) += cmm.o diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3a25bbf..90ec058 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -103,32 +103,28 @@ static void __init setup_ro_region(void) */ void __init paging_init(void) { - pgd_t *pg_dir; - int i; - unsigned long pgdir_k; static const int ssm_mask = 0x04000000L; unsigned long max_zone_pfns[MAX_NR_ZONES]; + unsigned long pgd_type; - pg_dir = swapper_pg_dir; - + init_mm.pgd = swapper_pg_dir; + S390_lowcore.kernel_asce = __pa(init_mm.pgd) & PAGE_MASK; #ifdef CONFIG_64BIT - pgdir_k = (__pa(swapper_pg_dir) & PAGE_MASK) | _KERN_REGION_TABLE; - for (i = 0; i < PTRS_PER_PGD; i++) - pgd_clear_kernel(pg_dir + i); + S390_lowcore.kernel_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + pgd_type = _REGION3_ENTRY_EMPTY; #else - pgdir_k = (__pa(swapper_pg_dir) & PAGE_MASK) | _KERNSEG_TABLE; - for (i = 0; i < PTRS_PER_PGD; i++) - pmd_clear_kernel((pmd_t *)(pg_dir + i)); + S390_lowcore.kernel_asce |= _ASCE_TABLE_LENGTH; + pgd_type = _SEGMENT_ENTRY_EMPTY; #endif + clear_table((unsigned long *) init_mm.pgd, pgd_type, + sizeof(unsigned long)*2048); vmem_map_init(); setup_ro_region(); - S390_lowcore.kernel_asce = pgdir_k; - /* enable virtual mapping in kernel mode */ - __ctl_load(pgdir_k, 1, 1); - __ctl_load(pgdir_k, 7, 7); - __ctl_load(pgdir_k, 13, 13); + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); __raw_local_irq_ssm(ssm_mask); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c new file mode 100644 index 0000000..e60e0ae --- /dev/null +++ b/arch/s390/mm/pgtable.c @@ -0,0 +1,94 @@ +/* + * arch/s390/mm/pgtable.c + * + * Copyright IBM Corp. 
2007 + * Author(s): Martin Schwidefsky + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef CONFIG_64BIT +#define ALLOC_ORDER 1 +#else +#define ALLOC_ORDER 2 +#endif + +unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) +{ + struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); + + if (!page) + return NULL; + page->index = 0; + if (noexec) { + struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER); + if (!shadow) { + __free_pages(page, ALLOC_ORDER); + return NULL; + } + page->index = page_to_phys(shadow); + } + return (unsigned long *) page_to_phys(page); +} + +void crst_table_free(unsigned long *table) +{ + unsigned long *shadow = get_shadow_table(table); + + if (shadow) + free_pages((unsigned long) shadow, ALLOC_ORDER); + free_pages((unsigned long) table, ALLOC_ORDER); +} + +/* + * page table entry allocation/free routines. + */ +unsigned long *page_table_alloc(int noexec) +{ + struct page *page = alloc_page(GFP_KERNEL); + unsigned long *table; + + if (!page) + return NULL; + page->index = 0; + if (noexec) { + struct page *shadow = alloc_page(GFP_KERNEL); + if (!shadow) { + __free_page(page); + return NULL; + } + table = (unsigned long *) page_to_phys(shadow); + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); + page->index = (addr_t) table; + } + table = (unsigned long *) page_to_phys(page); + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); + return table; +} + +void page_table_free(unsigned long *table) +{ + unsigned long *shadow = get_shadow_pte(table); + + if (shadow) + free_page((unsigned long) shadow); + free_page((unsigned long) table); + +} diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index fd594d5..1bd51d8 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -75,29 +75,24 @@ static void __init_refok *vmem_alloc_pages(unsigned int order) static inline pmd_t *vmem_pmd_alloc(void) 
{ - pmd_t *pmd; - int i; + pmd_t *pmd = NULL; - pmd = vmem_alloc_pages(PMD_ALLOC_ORDER); +#ifdef CONFIG_64BIT + pmd = vmem_alloc_pages(2); if (!pmd) return NULL; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_clear_kernel(pmd + i); + clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE*4); +#endif return pmd; } static inline pte_t *vmem_pte_alloc(void) { - pte_t *pte; - pte_t empty_pte; - int i; + pte_t *pte = vmem_alloc_pages(0); - pte = vmem_alloc_pages(PTE_ALLOC_ORDER); if (!pte) return NULL; - pte_val(empty_pte) = _PAGE_TYPE_EMPTY; - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = empty_pte; + clear_table((unsigned long *) pte, _PAGE_TYPE_EMPTY, PAGE_SIZE); return pte; } -- cgit v1.1 From 190a1d722a59725706daf832bc8a511ed62f249d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 22 Oct 2007 12:52:48 +0200 Subject: [S390] 4level-fixup cleanup Get independent from asm-generic/4level-fixup.h Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess_pt.c | 7 ++++++- arch/s390/mm/init.c | 4 +++- arch/s390/mm/vmem.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 37 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index dc37ea8..7e8efaa 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -18,13 +18,18 @@ static inline pte_t *follow_table(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return NULL; - pmd = pmd_offset(pgd, addr); + pud = pud_offset(pgd, addr); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + return NULL; + + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return NULL; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 90ec058..b234bb4 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -81,6 +81,7 @@ void show_mem(void) static void __init setup_ro_region(void) { pgd_t *pgd; + 
pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t new_pte; @@ -91,7 +92,8 @@ static void __init setup_ro_region(void) for (; address < end; address += PAGE_SIZE) { pgd = pgd_offset_k(address); - pmd = pmd_offset(pgd, address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); pte = pte_offset_kernel(pmd, address); new_pte = mk_pte_phys(address, __pgprot(_PAGE_RO)); *pte = new_pte; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 1bd51d8..fb9c5a8 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -73,6 +73,8 @@ static void __init_refok *vmem_alloc_pages(unsigned int order) return alloc_bootmem_pages((1 << order) * PAGE_SIZE); } +#define vmem_pud_alloc() ({ BUG(); ((pud_t *) NULL); }) + static inline pmd_t *vmem_pmd_alloc(void) { pmd_t *pmd = NULL; @@ -103,6 +105,7 @@ static int vmem_add_range(unsigned long start, unsigned long size) { unsigned long address; pgd_t *pg_dir; + pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; pte_t pte; @@ -111,13 +114,21 @@ static int vmem_add_range(unsigned long start, unsigned long size) for (address = start; address < start + size; address += PAGE_SIZE) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { + pu_dir = vmem_pud_alloc(); + if (!pu_dir) + goto out; + pgd_populate_kernel(&init_mm, pg_dir, pu_dir); + } + + pu_dir = pud_offset(pg_dir, address); + if (pud_none(*pu_dir)) { pm_dir = vmem_pmd_alloc(); if (!pm_dir) goto out; - pgd_populate_kernel(&init_mm, pg_dir, pm_dir); + pud_populate_kernel(&init_mm, pu_dir, pm_dir); } - pm_dir = pmd_offset(pg_dir, address); + pm_dir = pmd_offset(pu_dir, address); if (pmd_none(*pm_dir)) { pt_dir = vmem_pte_alloc(); if (!pt_dir) @@ -143,6 +154,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size) { unsigned long address; pgd_t *pg_dir; + pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; pte_t pte; @@ -150,9 +162,10 @@ static void vmem_remove_range(unsigned long start, unsigned long size) pte_val(pte) = _PAGE_TYPE_EMPTY; for (address = start; 
address < start + size; address += PAGE_SIZE) { pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) + pu_dir = pud_offset(pg_dir, address); + if (pud_none(*pu_dir)) continue; - pm_dir = pmd_offset(pg_dir, address); + pm_dir = pmd_offset(pu_dir, address); if (pmd_none(*pm_dir)) continue; pt_dir = pte_offset_kernel(pm_dir, address); @@ -169,6 +182,7 @@ static int vmem_add_mem_map(unsigned long start, unsigned long size) unsigned long address, start_addr, end_addr; struct page *map_start, *map_end; pgd_t *pg_dir; + pud_t *pu_dir; pmd_t *pm_dir; pte_t *pt_dir; pte_t pte; @@ -183,13 +197,21 @@ static int vmem_add_mem_map(unsigned long start, unsigned long size) for (address = start_addr; address < end_addr; address += PAGE_SIZE) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { + pu_dir = vmem_pud_alloc(); + if (!pu_dir) + goto out; + pgd_populate_kernel(&init_mm, pg_dir, pu_dir); + } + + pu_dir = pud_offset(pg_dir, address); + if (pud_none(*pu_dir)) { pm_dir = vmem_pmd_alloc(); if (!pm_dir) goto out; - pgd_populate_kernel(&init_mm, pg_dir, pm_dir); + pud_populate_kernel(&init_mm, pu_dir, pm_dir); } - pm_dir = pmd_offset(pg_dir, address); + pm_dir = pmd_offset(pu_dir, address); if (pmd_none(*pm_dir)) { pt_dir = vmem_pte_alloc(); if (!pt_dir) -- cgit v1.1