diff options
Diffstat (limited to 'fs')
62 files changed, 2012 insertions, 894 deletions
@@ -160,7 +160,7 @@ config EXT4_FS filesystem initially. To compile this file system support as a module, choose M here. The - module will be called ext4dev. + module will be called ext4. If unsure, say N. @@ -1168,195 +1168,7 @@ config EFS_FS To compile the EFS file system support as a module, choose M here: the module will be called efs. -config JFFS2_FS - tristate "Journalling Flash File System v2 (JFFS2) support" - select CRC32 - depends on MTD - help - JFFS2 is the second generation of the Journalling Flash File System - for use on diskless embedded devices. It provides improved wear - levelling, compression and support for hard links. You cannot use - this on normal block devices, only on 'MTD' devices. - - Further information on the design and implementation of JFFS2 is - available at <http://sources.redhat.com/jffs2/>. - -config JFFS2_FS_DEBUG - int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" - depends on JFFS2_FS - default "0" - help - This controls the amount of debugging messages produced by the JFFS2 - code. Set it to zero for use in production systems. For evaluation, - testing and debugging, it's advisable to set it to one. This will - enable a few assertions and will print debugging messages at the - KERN_DEBUG loglevel, where they won't normally be visible. Level 2 - is unlikely to be useful - it enables extra debugging in certain - areas which at one point needed debugging, but when the bugs were - located and fixed, the detailed messages were relegated to level 2. - - If reporting bugs, please try to have available a full dump of the - messages at debug level 1 while the misbehaviour was occurring. - -config JFFS2_FS_WRITEBUFFER - bool "JFFS2 write-buffering support" - depends on JFFS2_FS - default y - help - This enables the write-buffering support in JFFS2. - - This functionality is required to support JFFS2 on the following - types of flash devices: - - NAND flash - - NOR flash with transparent ECC - - DataFlash - -config JFFS2_FS_WBUF_VERIFY - bool "Verify JFFS2 write-buffer reads" - depends on JFFS2_FS_WRITEBUFFER - default n - help - This causes JFFS2 to read back every page written through the - write-buffer, and check for errors. - -config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - This feature makes it possible to use summary information - for faster filesystem mount. - - The summary information can be inserted into a filesystem image - by the utility 'sumtool'. - - If unsure, say 'N'. - -config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config JFFS2_FS_POSIX_ACL - bool "JFFS2 POSIX Access Control Lists" - depends on JFFS2_FS_XATTR - default y - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config JFFS2_FS_SECURITY - bool "JFFS2 Security Labels" - depends on JFFS2_FS_XATTR - default y - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the jffs2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFFS2_COMPRESSION_OPTIONS - bool "Advanced compression options for JFFS2" - depends on JFFS2_FS - default n - help - Enabling this option allows you to explicitly choose which - compression modules, if any, are enabled in JFFS2. Removing - compressors can mean you cannot read existing file systems, - and enabling experimental compressors can mean that you - write a file system which cannot be read by a standard kernel. - - If unsure, you should _definitely_ say 'N'. - -config JFFS2_ZLIB - bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS - select ZLIB_INFLATE - select ZLIB_DEFLATE - depends on JFFS2_FS - default y - help - Zlib is designed to be a free, general-purpose, legally unencumbered, - lossless data-compression library for use on virtually any computer - hardware and operating system. See <http://www.gzip.org/zlib/> for - further information. - - Say 'Y' if unsure. - -config JFFS2_LZO - bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS - select LZO_COMPRESS - select LZO_DECOMPRESS - depends on JFFS2_FS - default n - help - minilzo-based compression. Generally works better than Zlib. - - This feature was added in July, 2007. Say 'N' if you need - compatibility with older bootloaders or kernels. - -config JFFS2_RTIME - bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default y - help - Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. - -config JFFS2_RUBIN - bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default n - help - RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. - -choice - prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS - default JFFS2_CMODE_PRIORITY - depends on JFFS2_FS - help - You can set here the default compression mode of JFFS2 from - the available compression modes. Don't touch if unsure. - -config JFFS2_CMODE_NONE - bool "no compression" - help - Uses no compression. - -config JFFS2_CMODE_PRIORITY - bool "priority" - help - Tries the compressors in a predefined order and chooses the first - successful one. - -config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" - help - Tries all compressors and chooses the one which has the smallest - result. - -config JFFS2_CMODE_FAVOURLZO - bool "Favour LZO" - help - Tries all compressors and chooses the one which has the smallest - result but gives some preference to LZO (which has faster - decompression) at the expense of size. - -endchoice - +source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" @@ -1913,148 +1725,7 @@ config SMB_NLS_REMOTE smbmount from samba 2.2.0 or later supports this. -config CIFS - tristate "CIFS support (advanced network filesystem, SMBFS successor)" - depends on INET - select NLS - help - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block - (SMB) protocol, the native file sharing mechanism for most early - PC operating systems. The CIFS protocol is fully supported by - file servers such as Windows 2000 (including Windows 2003, NT 4 - and Windows XP) as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems). Limited - support for OS/2 and Windows ME and similar servers is provided as - well. - - The cifs module provides an advanced network file system - client for mounting to CIFS compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, - safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. - If you need to mount to Samba or Windows from this machine, say Y. - -config CIFS_STATS - bool "CIFS statistics" - depends on CIFS - help - Enabling this option will cause statistics for each server share - mounted by the cifs client to be displayed in /proc/fs/cifs/Stats - -config CIFS_STATS2 - bool "Extended statistics" - depends on CIFS_STATS - help - Enabling this option will allow more detailed statistics on SMB - request timing to be displayed in /proc/fs/cifs/DebugData and also - allow optional logging of slow responses to dmesg (depending on the - value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). - These additional statistics may have a minor effect on performance - and memory utilization. - - Unless you are a developer or are doing network performance analysis - or tuning, say N. - -config CIFS_WEAK_PW_HASH - bool "Support legacy servers which use weaker LANMAN security" - depends on CIFS - help - Modern CIFS servers including Samba and most Windows versions - (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) - security mechanisms. These hash the password more securely - than the mechanisms used in the older LANMAN version of the - SMB protocol but LANMAN based authentication is needed to - establish sessions with some old SMB servers. - - Enabling this option allows the cifs module to mount to older - LANMAN based servers such as OS/2 and Windows 95, but such - mounts may be less secure than mounts using NTLM or more recent - security mechanisms if you are on a public network. Unless you - have a need to access old SMB servers (and are on a private - network) you probably want to say N. Even if this support - is enabled in the kernel build, LANMAN authentication will not be - used automatically. At runtime LANMAN mounts are disabled but - can be set to required (or optional) either in - /proc/fs/cifs (see fs/cifs/README for more detail) or via an - option on the mount command. This support is disabled by - default in order to reduce the possibility of a downgrade - attack. - - If unsure, say N. - -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - -config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). CIFS maps the name of - extended attributes beginning with the user namespace prefix - to SMB/CIFS EAs. EAs are stored on Windows servers without the - user namespace prefix, but their names are seen by Linux cifs clients - prefaced by the user namespace prefix. The system namespace - (used by some filesystems to store ACLs) is not supported at - this time. - - If unsure, say N. - -config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. - -config CIFS_DEBUG2 - bool "Enable additional CIFS debugging routines" - depends on CIFS - help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. - -config CIFS_EXPERIMENTAL - bool "CIFS Experimental Features (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL - help - Enables cifs features under testing. These features are - experimental and currently include DFS support and directory - change notification ie fcntl(F_DNOTIFY), as well as the upcall - mechanism which will be used for Kerberos session negotiation - and uid remapping. Some of these features also may depend on - setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental - (which is disabled by default). See the file fs/cifs/README - for more details. If unsure, say N. - -config CIFS_DFS_UPCALL - bool "DFS feature support (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which contacts userspace - helper utilities to provide server name resolution (host names to - IP addresses) which is needed for implicit mounts of DFS junction - points. If unsure, say N. +source "fs/cifs/Kconfig" config NCP_FS tristate "NCP file system support (to mount NetWare volumes)" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 801db13..ce9fb3f 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default n + depends on BINFMT_ELF + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. + + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say N. + config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index d0c69f5..2168c90 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c76afa2..e215906 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off) static unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) { +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + /* The vma can be set up to tell us the answer directly. */ if (vma->vm_flags & VM_ALWAYSDUMP) goto whole; + /* Hugetlb memory check */ + if (vma->vm_flags & VM_HUGETLB) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + } + /* Do not dump I/O mapped devices or special mappings */ if (vma->vm_flags & (VM_IO | VM_RESERVED)) return 0; -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? diff --git a/fs/block_dev.c b/fs/block_dev.c index d84f0469..218408e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device + * @path: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) diff --git a/fs/buffer.c b/fs/buffer.c index ac78d4c..6569fda 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 0000000..341a989 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,142 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. + +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + +config CIFS_EXPERIMENTAL + bool "CIFS Experimental Features (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL + help + Enables cifs features under testing. These features are + experimental and currently include DFS support and directory + change notification ie fcntl(F_DNOTIFY), as well as the upcall + mechanism which will be used for Kerberos session negotiation + and uid remapping. Some of these features also may depend on + setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental + (which is disabled by default). See the file fs/cifs/README + for more details. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support (EXPERIMENTAL)" + depends on CIFS_EXPERIMENTAL + depends on KEYS + help + Enables an upcall mechanism for CIFS which contacts userspace + helper utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c4a8a06..62d8bd8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { @@ -1386,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, int nr_threads, long signr) +static int format_corename(char *corename, long signr) { const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); @@ -1493,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr) * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ - if (!ispipe && !pid_in_pattern - && (core_uses_pid || nr_threads)) { + if (!ispipe && !pid_in_pattern && core_uses_pid) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1757,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, retval, signr); + ispipe = format_corename(corename, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 92fd033..f5b57a2 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1547,6 +1547,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1585,7 +1586,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 2eea96e..4c82531 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext3_error (sb, "ext3_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ebfec4d..f8424ad 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1186,6 +1186,13 @@ write_begin_failed: ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 77278e9..78fdf38 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 399a96a..3a260af 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext3_show_quota_options(seq, sb); return 0; @@ -754,6 +757,7 @@ enum { Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -796,6 +800,8 @@ static const match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_BARRIER; else journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index bd2ece2..b9821be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6690a41..4880cc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -511,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6a0b40d..445fde6 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -99,9 +99,6 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b4ec9d..8dbf695 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); pagevec_init(&pvec, 0); @@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. + */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); - if (!err) + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ mpd->pages_written++; /* * In error case, we have to continue because @@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd) { - long to_write; int ret; if (!mpd->get_block) @@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping, mpd->pages_written = 0; mpd->retval = 0; - to_write = wbc->nr_to_write; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); - /* * Handle last extent of pages */ if (!mpd->io_done && mpd->next_page != mpd->first_page) { if (mpage_da_map_blocks(mpd) == 0) mpage_da_submit_io(mpd); - } - wbc->nr_to_write = to_write - mpd->pages_written; + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - loff_t range_start = 0; struct mpage_da_data mpd; struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; - long to_write, pages_skipped = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* @@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping, nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; wbc->nr_to_write = sbi->s_mb_stream_request; } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; - if (!wbc->range_cyclic) - /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index - */ - wbc->range_cont = 1; - - range_start = wbc->range_start; - pages_skipped = wbc->pages_skipped; + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; mpd.inode = mapping->host; -restart_loop: - to_write = wbc->nr_to_write; - while (!ret && to_write > 0) { + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { /* * we insert one extent at a time. So we need @@ -2422,48 +2436,53 @@ restart_loop: dump_stack(); goto out_writepages; } - to_write -= wbc->nr_to_write; - mpd.get_block = ext4_da_get_block_write; ret = mpage_da_writepages(mapping, wbc, &mpd); ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ jbd2_journal_force_commit_nested(sbi->s_journal); - - /* reset the retry count */ - if (ret == MPAGE_DA_EXTENT_TAIL) { + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* * got one extent now try with * rest of the pages */ - to_write += wbc->nr_to_write; + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; ret = 0; - } else if (wbc->nr_to_write) { + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; - } - - if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { - /* We skipped pages in this loop */ - wbc->range_start = range_start; - wbc->nr_to_write = to_write + - wbc->pages_skipped - pages_skipped; - wbc->pages_skipped = pages_skipped; - goto restart_loop; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write - nr_to_writebump; - wbc->range_start = range_start; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } @@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b580714..dfe17a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); @@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; @@ -2735,10 +2723,14 @@ err_out: remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_proc == NULL) @@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); - +#endif return 0; } @@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4384,35 +4385,20 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; } static noinline_for_stack int @@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } @@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b3b4828..b5dff1f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,23 +100,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; @@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dea8f13..9b2b2bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, @@ -915,7 +855,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_inode_readahead_blks }; @@ -933,8 +873,6 @@ static const match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, {Opt_orlov, "orlov"}, @@ -973,8 +911,6 @@ static const match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, @@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; @@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); @@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); - goto failed_mount4; - } - lock_kernel(); return 0; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fec8f61..0022eec 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b085d64..963be64 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ae08c05..25719d9 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal) printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); err = 0; } @@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -762,6 +765,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) @@ -852,6 +858,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 0540ca2..d15cd6e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + if (jh->b_transaction != NULL) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); @@ -1108,7 +1118,7 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } /** diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0abe02c..8b119e1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -995,6 +995,9 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d5405..39b7805 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 0000000..6ae169c --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at <http://sources.redhat.com/jffs2/>. + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See <http://www.gzip.org/zlib/> for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. + +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 86739ee..f25e70c 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, } /* jffs2_compress: - * @data: Pointer to uncompressed data - * @cdata: Pointer to returned pointer to buffer for compressed data + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data * @datalen: On entry, holds the amount of data available for compression. * On exit, expected to hold the amount of data actually compressed. * @cdatalen: On entry, holds the amount of space available for compressed diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index cd219ef..b1aaae8 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* FIXME: If you care. We'd need to use frags for the target if it grows much more than this */ if (targetlen > 254) - return -EINVAL; + return -ENAMETOOLONG; ri = jffs2_alloc_raw_inode(); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index dddb2a6..259461b 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr->len = c->sector_size; instr->callback = jffs2_erase_callback; instr->priv = (unsigned long)(&instr[1]); - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 086c438..249305d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; spin_lock(&c->erase_completion_lock); avail = c->dirty_size + c->free_size; @@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ - ri->uid = cpu_to_je16(current->fsuid); + ri->uid = cpu_to_je16(current_fsuid()); if (dir_i->i_mode & S_ISGID) { ri->gid = cpu_to_je16(dir_i->i_gid); if (S_ISDIR(mode)) mode |= S_ISGID; } else { - ri->gid = cpu_to_je16(current->fsgid); + ri->gid = cpu_to_je16(current_fsgid()); } /* POSIX ACLs have to be processed now, at least partly. diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index a9bf960..0875b60 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); return 0; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 0e78b00..d9a721e 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) memset(c->wbuf,0xff,c->wbuf_pagesize); /* adjust write buffer offset, else we get a non contiguous write bug */ - if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize)) - c->wbuf_ofs += c->wbuf_pagesize; - else - c->wbuf_ofs = 0xffffffff; + c->wbuf_ofs += c->wbuf_pagesize; c->wbuf_len = 0; return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2ab70d4..efdba2e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index d020866..3140a44 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index fbeb2f3..cfb0c80 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 59ea42e..61b25f4 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, unsigned long allowed; struct vmalloc_info vmi; long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; /* * display in kilobytes. @@ -154,51 +156,70 @@ static int meminfo_read_proc(char *page, char **start, off_t off, get_vmalloc_info(&vmi); + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + /* * Tagged format, for easy grepping and expansion. */ len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" #endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" + "Quicklists: %8lu kB\n" #endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), K(cached), K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), +#ifdef CONFIG_UNEVICTABLE_LRU + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 841368b..cd9ca67 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,9 +32,6 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. */ @@ -647,7 +644,7 @@ static int __init vmcore_init(void) int rc = 0; /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ - if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX)) + if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); if (rc) { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5145cb9..76acdbc 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b131234..f031d1c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/seq_file.c b/fs/seq_file.c index bd20f7f..eba2eab 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) { - size_t len = bitmap_scnprintf_len(nr_bits); + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); - if (m->count + len < m->size) { - bitmap_scnprintf(m->buf + m->count, m->size - m->count, - bits, nr_bits); - m->count += len; - return 0; +int seq_bitmap_list(struct seq_file *m, unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } } m->count = m->size; return -1; } +EXPORT_SYMBOL(seq_bitmap_list); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 73db464..1a4973e 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -414,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c) * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - * @c->lst.taken_empty_lebs * - * @empty_lebs are available because they are empty. @freeable_cnt are - * available because they contain only free and dirty space and the - * index allocation always occurs after wbufs are synch'ed. - * @idx_gc_cnt are available because they are index LEBs that have been - * garbage collected (including trivial GC) and are awaiting the commit - * before they can be unmapped - note that the in-the-gaps method will - * grab these if it needs them. @taken_empty_lebs are empty_lebs that - * have already been allocated for some purpose (also includes those - * LEBs on the @idx_gc list). + * @c->lst.empty_lebs are available because they are empty. + * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. * - * Note, @taken_empty_lebs may temporarily be higher by one because of - * the way we serialize LEB allocations and budgeting. See a comment in - * 'ubifs_find_free_space()'. + * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because + * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. */ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 5bb51da..a0ada59 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; * * Note, if the input buffer was not compressed, it is copied to the output * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. - * - * This functions returns %0 on success or a negative error code on failure. */ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d7f7645..7186400 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - printk(KERN_DEBUG "inode %lu\n", inode->i_ino); - printk(KERN_DEBUG "size %llu\n", + printk(KERN_DEBUG "Dump in-memory inode:"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", (unsigned long long)i_size_read(inode)); - printk(KERN_DEBUG "nlink %u\n", inode->i_nlink); - printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid); - printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid); - printk(KERN_DEBUG "atime %u.%u\n", + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", (unsigned int)inode->i_atime.tv_sec, (unsigned int)inode->i_atime.tv_nsec); - printk(KERN_DEBUG "mtime %u.%u\n", + printk(KERN_DEBUG "\tmtime %u.%u\n", (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); - printk(KERN_DEBUG "ctime %u.%u\n", + printk(KERN_DEBUG "\tctime %u.%u\n", (unsigned int)inode->i_ctime.tv_sec, (unsigned int)inode->i_ctime.tv_nsec); - printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum); - printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size); - printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt); - printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names); - printk(KERN_DEBUG "dirty %u\n", ui->dirty); - printk(KERN_DEBUG "xattr %u\n", ui->xattr); - printk(KERN_DEBUG "flags %d\n", ui->flags); - printk(KERN_DEBUG "compr_type %d\n", ui->compr_type); - printk(KERN_DEBUG "data_len %d\n", ui->data_len); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); } void dbg_dump_node(const struct ubifs_info *c, const void *node) @@ -647,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c) } } +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 50315fc..33d6b95 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst); void dbg_dump_budg(struct ubifs_info *c); void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); void dbg_dump_leb(const struct ubifs_info *c, int lnum); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); @@ -249,6 +250,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_cats(struct ubifs_info *c); int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); int dbg_check_synced_i_size(struct inode *inode); int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); int dbg_check_tnc(struct ubifs_info *c, int extra); @@ -367,6 +370,7 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_dump_budg(c) ({}) #define dbg_dump_lprop(c, lp) ({}) #define dbg_dump_lprops(c) ({}) +#define dbg_dump_lpt_info(c) ({}) #define dbg_dump_leb(c, lnum) ({}) #define dbg_dump_znode(c, znode) ({}) #define dbg_dump_heap(c, heap, cat) ({}) @@ -379,6 +383,8 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_check_old_index(c, zroot) 0 #define dbg_check_cats(c) 0 #define dbg_check_ltab(c) 0 +#define dbg_chk_lpt_free_spc(c) 0 +#define dbg_chk_lpt_sz(c, action, len) 0 #define dbg_check_synced_i_size(inode) 0 #define dbg_check_dir_size(c, dir) 0 #define dbg_check_tnc(c, x) 0 diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3d698e2..51cf511 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -147,6 +147,12 @@ static int do_readpage(struct page *page) err = ret; if (err != -ENOENT) break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); } } if (++i >= UBIFS_BLOCKS_PER_PAGE) @@ -577,8 +583,262 @@ out: return copied; } +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. + */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(dn->ch.sqnum > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @page1: first page + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct bu_info *bu; + int err, page_idx, page_cnt, ret = 0, n = 0; + loff_t isize; + + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS); + if (!bu) + return 0; + + bu->buf_len = c->bulk_read_buf_size; + bu->buf = kmalloc(bu->buf_len, GFP_NOFS); + if (!bu->buf) + goto out_free; + + data_key_init(c, &bu->key, inode->i_ino, + offset << UBIFS_BLOCKS_PER_PAGE_SHIFT); + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. + */ + ui->read_in_a_row = 0; + ui->bulk_read = 0; + goto out_free; + } + + if (bu->cnt) { + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + kfree(bu->buf); + kfree(bu); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + int ret = 0; + + ui->last_page_read = index; + + if (!c->bulk_read) + return 0; + /* + * Bulk-read is protected by ui_mutex, but it is an optimization, so + * don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + ret = ubifs_do_bulk_read(c, page); +out_unlock: + mutex_unlock(&ui->ui_mutex); + return ret; +} + static int ubifs_readpage(struct file *file, struct page *page) { + if (ubifs_bulk_read(page)) + return 0; do_readpage(page); unlock_page(page); return 0; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 47814cd..717d79c 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -901,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c) * it is needed now for this commit. */ lp = ubifs_lpt_lookup_dirty(c, lnum); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, lp->flags | LPROPS_INDEX, -1); - if (unlikely(IS_ERR(lp))) + if (IS_ERR(lp)) return PTR_ERR(lp); dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 02aba36..0bef650 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -96,6 +96,48 @@ static int switch_gc_head(struct ubifs_info *c) } /** + * joinup - bring data nodes for an inode together. + * @c: UBIFS file-system description object + * @sleb: describes scanned LEB + * @inum: inode number + * @blk: block number + * @data: list to which to add data nodes + * + * This function looks at the first few nodes in the scanned LEB @sleb and adds + * them to @data if they are data nodes from @inum and have a larger block + * number than @blk. This function returns %0 on success and a negative error + * code on failure. + */ +static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, + unsigned int blk, struct list_head *data) +{ + int err, cnt = 6, lnum = sleb->lnum, offs; + struct ubifs_scan_node *snod, *tmp; + union ubifs_key *key; + + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + key = &snod->key; + if (key_inum(c, key) == inum && + key_type(c, key) == UBIFS_DATA_KEY && + key_block(c, key) > blk) { + offs = snod->offs; + err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); + if (err < 0) + return err; + list_del(&snod->list); + if (err) { + list_add_tail(&snod->list, data); + blk = key_block(c, key); + } else + kfree(snod); + cnt = 6; + } else if (--cnt == 0) + break; + } + return 0; +} + +/** * move_nodes - move nodes. * @c: UBIFS file-system description object * @sleb: describes nodes to move @@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c) static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) { struct ubifs_scan_node *snod, *tmp; - struct list_head large, medium, small; + struct list_head data, large, medium, small; struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; int avail, err, min = INT_MAX; + unsigned int blk = 0; + ino_t inum = 0; + INIT_LIST_HEAD(&data); INIT_LIST_HEAD(&large); INIT_LIST_HEAD(&medium); INIT_LIST_HEAD(&small); - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - struct list_head *lst; + while (!list_empty(&sleb->nodes)) { + struct list_head *lst = sleb->nodes.next; + + snod = list_entry(lst, struct ubifs_scan_node, list); ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) if (err < 0) goto out; - lst = &snod->list; list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ @@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) } /* - * Sort the list of nodes so that large nodes go first, and - * small nodes go last. + * Sort the list of nodes so that data nodes go first, large + * nodes go second, and small nodes go last. */ - if (snod->len > MEDIUM_NODE_WM) - list_add(lst, &large); + if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { + if (inum != key_inum(c, &snod->key)) { + if (inum) { + /* + * Try to move data nodes from the same + * inode together. + */ + err = joinup(c, sleb, inum, blk, &data); + if (err) + goto out; + } + inum = key_inum(c, &snod->key); + blk = key_block(c, &snod->key); + } + list_add_tail(lst, &data); + } else if (snod->len > MEDIUM_NODE_WM) + list_add_tail(lst, &large); else if (snod->len > SMALL_NODE_WM) - list_add(lst, &medium); + list_add_tail(lst, &medium); else - list_add(lst, &small); + list_add_tail(lst, &small); /* And find the smallest node */ if (snod->len < min) @@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) * Join the tree lists so that we'd have one roughly sorted list * ('large' will be the head of the joined list). */ + list_splice(&data, &large); list_splice(&medium, large.prev); list_splice(&small, large.prev); @@ -653,7 +715,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -665,7 +727,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) if (err) goto out; lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -680,7 +742,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Record index freeable LEBs for unmapping after commit */ while (1) { lp = ubifs_fast_find_frdi_idx(c); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } @@ -696,7 +758,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c) /* Don't release the LEB until after the next commit */ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); - if (unlikely(IS_ERR(lp))) { + if (IS_ERR(lp)) { err = PTR_ERR(lp); kfree(idx_gc); goto out; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 054363f..0168271 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -62,6 +62,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) { if (!c->ro_media) { c->ro_media = 1; + c->no_chk_data_crc = 0; ubifs_warn("switched to read-only mode, error %d", err); dbg_dump_stack(); } @@ -74,6 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock * @quiet: print no messages + * @chk_crc: indicates whether to always check the CRC * * This function checks node magic number and CRC checksum. This function also * validates node length to prevent UBIFS from becoming crazy when an attacker @@ -85,7 +87,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * or magic. */ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet) + int offs, int quiet, int chk_crc) { int err = -EINVAL, type, node_len; uint32_t crc, node_crc, magic; @@ -121,6 +123,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; + if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) { @@ -722,7 +728,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; @@ -781,7 +787,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, goto out; } - err = ubifs_check_node(c, buf, lnum, offs, 0); + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { ubifs_err("expected node type %d", type); return err; diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 8f74760..9ee6508 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c, * @key2: the second key to compare * * This function compares 2 keys and returns %-1 if @key1 is less than - * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2. + * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2. */ static inline int keys_cmp(const struct ubifs_info *c, const union ubifs_key *key1, @@ -503,6 +503,26 @@ static inline int keys_cmp(const struct ubifs_info *c, } /** + * keys_eq - determine if keys are equivalent. + * @c: UBIFS file-system description object + * @key1: the first key to compare + * @key2: the second key to compare + * + * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and + * %0 if not. + */ +static inline int keys_eq(const struct ubifs_info *c, + const union ubifs_key *key1, + const union ubifs_key *key2) +{ + if (key1->u32[0] != key2->u32[0]) + return 0; + if (key1->u32[1] != key2->u32[1]) + return 0; + return 1; +} + +/** * is_hash_key - is a key vulnerable to hash collisions. * @c: UBIFS file-system description object * @key: key diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 2ba93da..f27176e 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, } } } + /* Not greater than parent, so compare to children */ while (1) { /* Compare to left child */ @@ -460,18 +461,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) } /** - * ubifs_get_lprops - get reference to LEB properties. - * @c: the UBIFS file-system description object - * - * This function locks lprops. Lprops have to be unlocked by - * 'ubifs_release_lprops()'. - */ -void ubifs_get_lprops(struct ubifs_info *c) -{ - mutex_lock(&c->lp_mutex); -} - -/** * calc_dark - calculate LEB dark space size. * @c: the UBIFS file-system description object * @spc: amount of free and dirty space in the LEB @@ -576,7 +565,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); spin_lock(&c->space_lock); - if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) c->lst.taken_empty_lebs -= 1; @@ -637,31 +625,12 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, c->lst.taken_empty_lebs += 1; change_category(c, lprops); - c->idx_gc_cnt += idx_gc_cnt; - spin_unlock(&c->space_lock); - return lprops; } /** - * ubifs_release_lprops - release lprops lock. - * @c: the UBIFS file-system description object - * - * This function has to be called after each 'ubifs_get_lprops()' call to - * unlock lprops. - */ -void ubifs_release_lprops(struct ubifs_info *c) -{ - ubifs_assert(mutex_is_locked(&c->lp_mutex)); - ubifs_assert(c->lst.empty_lebs >= 0 && - c->lst.empty_lebs <= c->main_lebs); - - mutex_unlock(&c->lp_mutex); -} - -/** * ubifs_get_lp_stats - get lprops statistics. * @c: UBIFS file-system description object * @st: return statistics @@ -1262,7 +1231,6 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; out_print: diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 9ff2463..db8bd0e 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c) c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; c->lpt_sz += c->ltab_sz; - c->lpt_sz += c->lsave_sz; + if (c->big_lpt) + c->lpt_sz += c->lsave_sz; /* Add wastage */ sz = c->lpt_sz; @@ -287,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits) const int k = 32 - nrbits; uint8_t *p = *addr; int b = *pos; - uint32_t val; + uint32_t uninitialized_var(val); + const int bytes = (nrbits + b + 7) >> 3; ubifs_assert(nrbits > 0); ubifs_assert(nrbits <= 32); ubifs_assert(*pos >= 0); ubifs_assert(*pos < 8); if (b) { - val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) | - ((uint32_t)p[4] << 24); + switch (bytes) { + case 2: + val = p[1]; + break; + case 3: + val = p[1] | ((uint32_t)p[2] << 8); + break; + case 4: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16); + break; + case 5: + val = p[1] | ((uint32_t)p[2] << 8) | + ((uint32_t)p[3] << 16) | + ((uint32_t)p[4] << 24); + } val <<= (8 - b); val |= *p >> b; nrbits += b; - } else - val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | - ((uint32_t)p[3] << 24); + } else { + switch (bytes) { + case 1: + val = p[0]; + break; + case 2: + val = p[0] | ((uint32_t)p[1] << 8); + break; + case 3: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16); + break; + case 4: + val = p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | + ((uint32_t)p[3] << 24); + break; + } + } val <<= k; val >>= k; b = nrbits & 7; - p += nrbits / 8; + p += nrbits >> 3; *addr = p; *pos = b; ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5f0b83e..eed5a00 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum) return 0; } } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c) int lnum, offs, len, alen, done_lsave, done_ltab, err; struct ubifs_cnode *cnode; + err = dbg_chk_lpt_sz(c, 0, 0); + if (err) + return err; cnode = c->lpt_cnext; if (!cnode) return 0; @@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { @@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } do { @@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { @@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lpt_offs = offs; } offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c) c->lsave_lnum = lnum; c->lsave_offs = offs; offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 2, alen - offs); err = alloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c) c->ltab_lnum = lnum; c->ltab_offs = offs; offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); + dbg_chk_lpt_sz(c, 4, alen - offs); + err = dbg_chk_lpt_sz(c, 3, alen); + if (err) + return err; return 0; + +no_space: + ubifs_err("LPT out of space"); + dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum) *lnum = i + c->lpt_first; return 0; } - dbg_err("last LEB %d", *lnum); - dump_stack(); return -ENOSPC; } @@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } if (offs + c->ltab_sz <= c->leb_size) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Loop for each cnode */ @@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 4, alen - wlen); } + dbg_chk_lpt_sz(c, 2, 0); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; from = 0; ubifs_assert(lnum >= c->lpt_first && @@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); continue; } if (!done_ltab) { done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); continue; } break; @@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c) clear_bit(COW_ZNODE, &cnode->flags); smp_mb__after_clear_bit(); offs += len; + dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; } while (cnode && cnode != c->lpt_cnext); @@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c) done_lsave = 1; ubifs_pack_lsave(c, buf + offs, c->lsave); offs += c->lsave_sz; + dbg_chk_lpt_sz(c, 1, c->lsave_sz); } /* Make sure to place LPT's own lprops table */ @@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; + dbg_chk_lpt_sz(c, 2, alen - wlen); err = realloc_lpt_leb(c, &lnum); if (err) - return err; + goto no_space; offs = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); @@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c) done_ltab = 1; ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); offs += c->ltab_sz; + dbg_chk_lpt_sz(c, 1, c->ltab_sz); } /* Write remaining data in buffer */ @@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c) err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); if (err) return err; + + dbg_chk_lpt_sz(c, 4, alen - wlen); + err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size)); + if (err) + return err; + c->nhead_lnum = lnum; c->nhead_offs = ALIGN(offs, c->min_io_size); @@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c) dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs); if (c->big_lpt) dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs); + return 0; + +no_space: + ubifs_err("LPT out of space mismatch"); + dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + dbg_dump_lpt_info(c); + return err; } /** @@ -1044,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) int pos = 0, node_type, node_len; uint16_t crc, calc_crc; + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); if (node_type == UBIFS_LPT_NOT_A_NODE) return 0; @@ -1156,6 +1203,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c) dbg_lp(""); mutex_lock(&c->lp_mutex); + err = dbg_chk_lpt_free_spc(c); + if (err) + goto out; err = dbg_check_ltab(c); if (err) goto out; @@ -1645,4 +1695,121 @@ int dbg_check_ltab(struct ubifs_info *c) return 0; } +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: action + * @len: length written + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + long long chk_lpt_sz, lpt_sz; + int err = 0; + + switch (action) { + case 0: + c->chk_lpt_sz = 0; + c->chk_lpt_sz2 = 0; + c->chk_lpt_lebs = 0; + c->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: + c->chk_lpt_sz += len; + return 0; + case 2: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + c->chk_lpt_lebs += 1; + return 0; + case 3: + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= c->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (c->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + c->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + c->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + c->chk_lpt_sz, c->chk_lpt_sz2); + err = -EINVAL; + } + if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { + dbg_err("LPT new nhead offs: expected %d was %d", + c->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) + dbg_dump_lpt_info(c); + c->chk_lpt_sz2 = c->chk_lpt_sz; + c->chk_lpt_sz = 0; + c->chk_lpt_wastage = 0; + c->chk_lpt_lebs = 0; + c->new_nhead_offs = len; + return err; + case 4: + c->chk_lpt_sz += len; + c->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4c12a92..4fa81d8 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -310,4 +310,31 @@ static inline int ubifs_tnc_lookup(struct ubifs_info *c, return ubifs_tnc_locate(c, key, node, NULL, NULL); } +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops. Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. + */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index acf5c5f..0ed8247 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, dbg_scan("scanning %s", dbg_ntype(ch->node_type)); - if (ubifs_check_node(c, buf, lnum, offs, quiet)) + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) return SCANNED_A_CORRUPT_NODE; if (ch->node_type == UBIFS_PAD_NODE) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a92203..8780efb 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -401,6 +401,16 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) else if (c->mount_opts.unmount_mode == 1) seq_printf(s, ",norm_unmount"); + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + return 0; } @@ -408,13 +418,26 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { struct ubifs_info *c = sb->s_fs_info; int i, ret = 0, err; + long long bud_bytes; - if (c->jheads) + if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err && !ret) ret = err; } + + /* Commit the journal unless it has too little data */ + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + if (bud_bytes > c->leb_size) { + err = ubifs_run_commit(c); + if (err) + return err; + } + } + /* * We ought to call sync for c->ubi but it does not have one. If it had * it would in turn call mtd->sync, however mtd operations are @@ -538,6 +561,18 @@ static int init_constants_early(struct ubifs_info *c) * calculations when reporting free space. */ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + /* Buffer size for bulk-reads */ + c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->bulk_read_buf_size > c->leb_size) + c->bulk_read_buf_size = c->leb_size; + if (c->bulk_read_buf_size > 128 * 1024) { + /* Check if we can kmalloc more than 128KiB */ + void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL); + + kfree(try); + if (!try) + c->bulk_read_buf_size = 128 * 1024; + } return 0; } @@ -840,17 +875,29 @@ static int check_volume_empty(struct ubifs_info *c) * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, Opt_err, }; static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, {Opt_err, NULL}, }; @@ -888,6 +935,22 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, c->mount_opts.unmount_mode = 1; c->fast_unmount = 0; break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; default: ubifs_err("unrecognized mount option \"%s\" " "or missing value", p); @@ -996,6 +1059,8 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } + c->always_chk_crc = 1; + err = ubifs_read_superblock(c); if (err) goto out_free; @@ -1032,8 +1097,6 @@ static int mount_ubifs(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1139,24 +1202,28 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; + c->always_chk_crc = 0; + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); if (mounted_read_only) ubifs_msg("mounted read-only"); x = (long long)c->main_lebs * c->leb_size; - ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->main_lebs); + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)", - x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); - ubifs_msg("media format %d, latest format %d", + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: %d (latest is %d)", c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); dbg_msg("LEB size: %d bytes (%d KiB)", - c->leb_size, c->leb_size / 1024); + c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" @@ -1282,6 +1349,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); c->remounting_rw = 1; + c->always_chk_crc = 1; /* Check for enough free space */ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { @@ -1345,20 +1413,20 @@ static int ubifs_remount_rw(struct ubifs_info *c) /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); - if (!c->bgt) - c->bgt = ERR_PTR(-EINVAL); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err("cannot spawn \"%s\", error %d", c->bgt_name, err); - return err; + goto out; } wake_up_process(c->bgt); c->orph_buf = vmalloc(c->leb_size); - if (!c->orph_buf) - return -ENOMEM; + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1385,6 +1453,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->vfs_sb->s_flags &= ~MS_RDONLY; c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return 0; @@ -1400,6 +1469,7 @@ out: c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; + c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } @@ -1408,12 +1478,9 @@ out: * commit_on_unmount - commit the journal when un-mounting. * @c: UBIFS file-system description object * - * This function is called during un-mounting and it commits the journal unless - * the "fast unmount" mode is enabled. It also avoids committing the journal if - * it contains too few data. - * - * Sometimes recovery requires the journal to be committed at least once, and - * this function takes care about this. + * This function is called during un-mounting and re-mounting, and it commits + * the journal unless the "fast unmount" mode is enabled. It also avoids + * committing the journal if it contains too few data. */ static void commit_on_unmount(struct ubifs_info *c) { diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 7634c59..d27fd91 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, } zn = copy_znode(c, znode); - if (unlikely(IS_ERR(zn))) + if (IS_ERR(zn)) return zn; if (zbr->len) { @@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; + if (type == UBIFS_DATA_NODE && !c->always_chk_crc) + if (c->no_chk_data_crc) + return 0; + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) @@ -1128,7 +1132,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, ubifs_assert(znode == c->zroot.znode); znode = dirty_cow_znode(c, &c->zroot); } - if (unlikely(IS_ERR(znode)) || !p) + if (IS_ERR(znode) || !p) break; ubifs_assert(path[p - 1] >= 0); ubifs_assert(path[p - 1] < znode->child_cnt); @@ -1492,6 +1496,289 @@ out: } /** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. + */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. + */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. + */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. + */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* Validate the nodes read */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** * do_lookup_nm- look up a "hashed" node. * @c: UBIFS file-system description object * @key: node key to lookup @@ -1675,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, { struct ubifs_znode *zn, *zi, *zp; int i, keep, move, appending = 0; - union ubifs_key *key = &zbr->key; + union ubifs_key *key = &zbr->key, *key1; ubifs_assert(n >= 0 && n <= c->fanout); @@ -1716,20 +2003,33 @@ again: zn->level = znode->level; /* Decide where to split */ - if (znode->level == 0 && n == c->fanout && - key_type(c, key) == UBIFS_DATA_KEY) { - union ubifs_key *key1; - - /* - * If this is an inode which is being appended - do not split - * it because no other zbranches can be inserted between - * zbranches of consecutive data nodes anyway. - */ - key1 = &znode->zbranch[n - 1].key; - if (key_inum(c, key1) == key_inum(c, key) && - key_type(c, key1) == UBIFS_DATA_KEY && - key_block(c, key1) == key_block(c, key) - 1) - appending = 1; + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } } if (appending) { @@ -1759,6 +2059,8 @@ again: zbr->znode->parent = zn; } +do_split: + __set_bit(DIRTY_ZNODE, &zn->flags); atomic_long_inc(&c->dirty_zn_cnt); @@ -1785,14 +2087,11 @@ again: /* Insert new znode (produced by spitting) into the parent */ if (zp) { - i = n; + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + /* Locate insertion point */ n = znode->iip + 1; - if (appending && n != c->fanout) - appending = 0; - - if (i == 0 && zi == znode && znode->iip == 0) - correct_parent_keys(c, znode); /* Tail recursion */ zbr->key = zn->zbranch[0].key; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index a25c1cc..b48db99 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, } /* Make sure the key of the read node is correct */ - key_read(c, key, &key1); - if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) { + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { ubifs_err("bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); dbg_tnc("looked for key %s found node's key %s", diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index a9ecbd9..0b37804 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -75,7 +75,6 @@ */ #define UBIFS_BLOCK_SIZE 4096 #define UBIFS_BLOCK_SHIFT 12 -#define UBIFS_BLOCK_MASK 0x00000FFF /* UBIFS padding byte pattern (must not be first or last byte of node magic) */ #define UBIFS_PADDING_BYTE 0xCE diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 17c620b..a7bd32f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -142,6 +142,9 @@ /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + /* * Lockdep classes for UBIFS inode @ui_mutex. */ @@ -328,9 +331,10 @@ struct ubifs_gced_idx_leb { * this inode * @dirty: non-zero if the inode is dirty * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used * @ui_mutex: serializes inode write-back with the rest of VFS operations, - * serializes "clean <-> dirty" state changes, protects @dirty, - * @ui_size, and @xattr_size + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size * @ui_lock: protects @synced_i_size * @synced_i_size: synchronized size of inode, i.e. the value of inode size * currently stored on the flash; used only for regular file @@ -338,6 +342,8 @@ struct ubifs_gced_idx_leb { * @ui_size: inode size used by UBIFS when writing to flash * @flags: inode flags (@UBIFS_COMPR_FL, etc) * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data * @@ -379,12 +385,15 @@ struct ubifs_inode { unsigned int xattr_names; unsigned int dirty:1; unsigned int xattr:1; + unsigned int bulk_read:1; struct mutex ui_mutex; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; int flags; int compr_type; + pgoff_t last_page_read; + pgoff_t read_in_a_row; int data_len; void *data; }; @@ -698,8 +707,8 @@ struct ubifs_jhead { * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. * @key: key * @znode: znode address in memory - * @lnum: LEB number of the indexing node - * @offs: offset of the indexing node within @lnum + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum * @len: target node length */ struct ubifs_zbranch { @@ -744,6 +753,28 @@ struct ubifs_znode { }; /** + * struct bu_info - bulk-read information + * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @oef: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** * struct ubifs_node_range - node length range description data structure. * @len: fixed node length * @min_len: minimum possible node length @@ -862,9 +893,13 @@ struct ubifs_orphan { /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable bulk-reads + * @chk_data_crc: check CRCs when reading data nodes */ struct ubifs_mount_opts { unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; }; /** @@ -905,13 +940,12 @@ struct ubifs_mount_opts { * @cmt_state: commit state * @cs_lock: commit state lock * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * * @fast_unmount: do not run journal commit before un-mounting * @big_lpt: flag that LPT is too big to write whole during commit - * @check_lpt_free: flag that indicates LPT GC may be needed - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -935,6 +969,7 @@ struct ubifs_mount_opts { * @mst_node: master node * @mst_offs: offset of valid master node * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * @bulk_read_buf_size: buffer size for bulk-reads * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes @@ -977,12 +1012,17 @@ struct ubifs_mount_opts { * but which still have to be taken into account because * the index has not been committed so far * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst; + * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, + * @nospace, and @nospace_rp; * @min_idx_lebs: minimum number of LEBs required for the index * @old_idx_sz: size of index on flash * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) * @lst: lprops statistics + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full * * @page_budget: budget for a page * @inode_budget: budget for an inode @@ -1061,6 +1101,7 @@ struct ubifs_mount_opts { * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab * @dirty_nn_cnt: number of dirty nnodes * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed * @lpt_sz: LPT size * @lpt_nod_buf: buffer for an on-flash nnode or pnode * @lpt_buf: buffer of LEB size used by LPT @@ -1102,6 +1143,7 @@ struct ubifs_mount_opts { * @rcvrd_mst_node: recovered master node to write when mounting ro to rw * @size_tree: inode size information for recovery * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @always_chk_crc: always check CRCs (while mounting and remounting rw) * @mount_opts: UBIFS-specific mount options * * @dbg_buf: a buffer of LEB size used for debugging purposes @@ -1146,11 +1188,11 @@ struct ubifs_info { int cmt_state; spinlock_t cs_lock; wait_queue_head_t cmt_wq; + unsigned int fast_unmount:1; unsigned int big_lpt:1; - unsigned int check_lpt_free:1; - unsigned int nospace:1; - unsigned int nospace_rp:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1175,6 +1217,7 @@ struct ubifs_info { struct ubifs_mst_node *mst_node; int mst_offs; struct mutex mst_mutex; + int bulk_read_buf_size; int log_lebs; long long log_bytes; @@ -1218,6 +1261,8 @@ struct ubifs_info { unsigned long long old_idx_sz; unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; + unsigned int nospace:1; + unsigned int nospace_rp:1; int page_budget; int inode_budget; @@ -1294,6 +1339,7 @@ struct ubifs_info { int lpt_drty_flgs; int dirty_nn_cnt; int dirty_pn_cnt; + int check_lpt_free; long long lpt_sz; void *lpt_nod_buf; void *lpt_buf; @@ -1335,6 +1381,7 @@ struct ubifs_info { struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; int remounting_rw; + int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG @@ -1347,6 +1394,12 @@ struct ubifs_info { unsigned long fail_timeout; unsigned int fail_cnt; unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_lnum; + int new_nhead_offs; #endif }; @@ -1377,7 +1430,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, int offs, int dtype); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, - int offs, int quiet); + int offs, int quiet, int chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); int ubifs_io_init(struct ubifs_info *c); @@ -1490,6 +1543,8 @@ void destroy_old_idx(struct ubifs_info *c); int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs); int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); /* tnc_misc.c */ struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, @@ -1586,12 +1641,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c); void ubifs_lpt_free(struct ubifs_info *c, int wr_only); /* lprops.c */ -void ubifs_get_lprops(struct ubifs_info *c); const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops *lp, int free, int dirty, int flags, int idx_gc_cnt); -void ubifs_release_lprops(struct ubifs_info *c); void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, int cat); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 649bec7..cfd31e2 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -446,7 +446,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int type; xent = ubifs_tnc_next_ent(c, &key, &nm); - if (unlikely(IS_ERR(xent))) { + if (IS_ERR(xent)) { err = PTR_ERR(xent); break; } |