diff options
Diffstat (limited to 'fs')
104 files changed, 4272 insertions, 2175 deletions
@@ -30,15 +30,6 @@ config FS_MBCACHE source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" -config FS_POSIX_ACL -# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) -# -# NOTE: you can implement Posix ACLs without these helpers (XFS does). -# Never use this symbol for ifdefs. -# - bool - default n - source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" @@ -47,6 +38,14 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +# Posix ACL utility routines +# +# Note: Posix ACLs can be implemented without these helpers. Never use +# this symbol for ifdefs in core code. +# +config FS_POSIX_ACL + def_bool n + config EXPORTFS tristate diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e6a4ab9..20c106f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -66,6 +66,7 @@ const struct dentry_operations afs_fs_dentry_operations = { .d_revalidate = afs_d_revalidate, .d_delete = afs_d_delete, .d_release = afs_d_release, + .d_automount = afs_d_automount, }; #define AFS_DIR_HASHTBL_SIZE 128 diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0747339..db66c52 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, inode->i_generation = 0; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - inode->i_flags |= S_NOATIME; + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; unlock_new_inode(inode); _leave(" = %p", inode); return inode; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 58c633b..5a9b684 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -592,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations; extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; +extern struct vfsmount *afs_d_automount(struct path *); extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index e83c033..aa59184 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); static int afs_mntpt_open(struct inode *inode, struct file *file); -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); static void afs_mntpt_expiry_timed_out(struct work_struct *work); const struct file_operations afs_mntpt_file_operations = { @@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .follow_link = afs_mntpt_follow_link, .readlink = page_readlink, .getattr = afs_getattr, }; const struct inode_operations afs_autocell_inode_operations = { - .follow_link = afs_mntpt_follow_link, .getattr = afs_getattr, }; @@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) _debug("symlink is a mountpoint"); spin_lock(&vnode->lock); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + vnode->vfs_inode.i_flags |= S_AUTOMOUNT; spin_unlock(&vnode->lock); } @@ -238,52 +236,24 @@ error_no_devname: } /* - * follow a link from a mountpoint directory, thus causing it to be mounted + * handle an automount point */ -static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - int err; - _enter("%p{%s},{%s:%p{%s},}", - dentry, - dentry->d_name.name, - nd->path.mnt->mnt_devname, - dentry, - nd->path.dentry->d_name.name); - - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - - newmnt = afs_mntpt_do_automount(nd->path.dentry); - if (IS_ERR(newmnt)) { - path_put(&nd->path); - return (void *)newmnt; - } + _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name); - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, - afs_mntpt_expiry_timeout * HZ); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } + newmnt = afs_mntpt_do_automount(path->dentry); + if (IS_ERR(newmnt)) + return newmnt; - _leave(" = %d", err); - return ERR_PTR(err); + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &afs_vfsmounts); + queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, + afs_mntpt_expiry_timeout * HZ); + _leave(" = %p {%s}", newmnt, newmnt->mnt_devname); + return newmnt; } /* @@ -87,7 +87,7 @@ static int __init aio_setup(void) aio_wq = create_workqueue("aio"); abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); - BUG_ON(!abe_pool); + BUG_ON(!aio_wq || !abe_pool); pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index cbe57f3..c5567cb 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -233,7 +233,7 @@ static int __init anon_inode_init(void) return 0; err_mntput: - mntput_long(anon_inode_mnt); + mntput(anon_inode_mnt); err_unregister_filesystem: unregister_filesystem(&anon_inode_fs_type); err_exit: diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 0fffe1c..54f9237 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -88,18 +88,9 @@ struct autofs_info { uid_t uid; gid_t gid; - - mode_t mode; - size_t size; - - void (*free)(struct autofs_info *); - union { - const char *symlink; - } u; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ -#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ struct autofs_wait_queue { @@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry) return 0; } -static inline void autofs4_copy_atime(struct file *src, struct file *dst) -{ - dst->f_path.dentry->d_inode->i_atime = - src->f_path.dentry->d_inode->i_atime; - return; -} - -struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *); +struct inode *autofs4_get_inode(struct super_block *, mode_t); void autofs4_free_ino(struct autofs_info *); /* Expiration */ @@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void); extern const struct inode_operations autofs4_symlink_inode_operations; extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct inode_operations autofs4_root_inode_operations; -extern const struct inode_operations autofs4_indirect_root_inode_operations; -extern const struct inode_operations autofs4_direct_root_inode_operations; extern const struct file_operations autofs4_dir_operations; extern const struct file_operations autofs4_root_operations; +extern const struct dentry_operations autofs4_dentry_operations; + +/* VFS automount flags management functions */ + +static inline void __managed_dentry_set_automount(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_set_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_automount(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT; +} + +static inline void managed_dentry_clear_automount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_transit(struct dentry *dentry) +{ + dentry->d_flags |= DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_set_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_transit(struct dentry *dentry) +{ + dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT; +} + +static inline void managed_dentry_clear_transit(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} /* Initializing function */ int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode); +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); +void autofs4_clean_ino(struct autofs_info *); /* Queue management functions */ @@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct path *path) -{ - int res = 0; - - while (d_mountpoint(path->dentry)) { - int followed = follow_down(path); - if (!followed) - break; - res = 1; - } - return res; -} - static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); @@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) return; } -void autofs4_dentry_release(struct dentry *); extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index eff9a41..1442da4 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); - if (follow_down(&path)) + if (follow_down_one(&path)) magic = path.mnt->mnt_sb->s_magic; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cc1d013..f43100b 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry, if (ino == NULL) return 0; - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - return 0; - if (!do_now) { /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, now)) @@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) path_get(&path); - if (!follow_down(&path)) + if (!follow_down_one(&path)) goto done; if (is_autofs4_dentry(path.dentry)) { @@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *p, *ret; if (prev == NULL) - return dget(prev); + return dget(root); spin_lock(&autofs4_lock); relock: @@ -137,7 +133,7 @@ again: spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); /* Negative dentry - try next */ if (!simple_positive(ret)) { - spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); p = ret; goto again; } @@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, unsigned long timeout; struct dentry *root = dget(sb->s_root); int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; if (!root) return NULL; @@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, timeout = sbi->exp_timeout; spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + return NULL; + } + managed_dentry_set_transit(root); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - if (d_mountpoint(root)) { - ino->flags |= AUTOFS_INF_MOUNTPOINT; - spin_lock(&root->d_lock); - root->d_flags &= ~DCACHE_MOUNTED; - spin_unlock(&root->d_lock); - } ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + managed_dentry_clear_transit(root); spin_unlock(&sbi->fs_lock); dput(root); @@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, while ((dentry = get_next_positive_dentry(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + goto cont; + managed_dentry_set_transit(dentry); /* * Case 1: (i) indirect mount or top level pseudo direct mount @@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + managed_dentry_clear_transit(dentry); +cont: spin_unlock(&sbi->fs_lock); } return NULL; @@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; + if (!d_unhashed(dentry)) + managed_dentry_clear_transit(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_MOUNTPOINT) { - spin_lock(&sb->s_root->d_lock); - /* - * If we haven't been expired away, then reset - * mounted status. - */ - if (mnt->mnt_parent != mnt) - sb->s_root->d_flags |= DCACHE_MOUNTED; - spin_unlock(&sb->s_root->d_lock); - ino->flags &= ~AUTOFS_INF_MOUNTPOINT; - } ino->flags &= ~AUTOFS_INF_EXPIRING; + spin_lock(&dentry->d_lock); + if (ret) + __managed_dentry_clear_transit(dentry); + else { + if ((IS_ROOT(dentry) || + (autofs_type_indirect(sbi->type) && + IS_ROOT(dentry->d_parent))) && + !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_automount(dentry); + } + spin_unlock(&dentry->d_lock); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a7bdb9d..180fa24 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -22,77 +22,27 @@ #include "autofs_i.h" #include <linux/module.h> -static void ino_lnkfree(struct autofs_info *ino) +struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) { - if (ino->u.symlink) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; - } -} - -struct autofs_info *autofs4_init_ino(struct autofs_info *ino, - struct autofs_sb_info *sbi, mode_t mode) -{ - int reinit = 1; - - if (ino == NULL) { - reinit = 0; - ino = kmalloc(sizeof(*ino), GFP_KERNEL); - } - - if (ino == NULL) - return NULL; - - if (!reinit) { - ino->flags = 0; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; + struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { INIT_LIST_HEAD(&ino->active); - ino->active_count = 0; INIT_LIST_HEAD(&ino->expiring); - atomic_set(&ino->count, 0); + ino->last_used = jiffies; + ino->sbi = sbi; } + return ino; +} +void autofs4_clean_ino(struct autofs_info *ino) +{ ino->uid = 0; ino->gid = 0; - ino->mode = mode; ino->last_used = jiffies; - - ino->sbi = sbi; - - if (reinit && ino->free) - (ino->free)(ino); - - memset(&ino->u, 0, sizeof(ino->u)); - - ino->free = NULL; - - if (S_ISLNK(mode)) - ino->free = ino_lnkfree; - - return ino; } void autofs4_free_ino(struct autofs_info *ino) { - struct autofs_info *p_ino; - - if (ino->dentry) { - ino->dentry->d_fsdata = NULL; - if (ino->dentry->d_inode) { - struct dentry *parent = ino->dentry->d_parent; - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(parent); - if (p_ino && parent != ino->dentry) - atomic_dec(&p_ino->count); - } - dput(ino->dentry); - } - ino->dentry = NULL; - } - if (ino->free) - (ino->free)(ino); kfree(ino); } @@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } +static void autofs4_evict_inode(struct inode *inode) +{ + end_writeback(inode); + kfree(inode->i_private); +} + static const struct super_operations autofs4_sops = { .statfs = simple_statfs, .show_options = autofs4_show_options, + .evict_inode = autofs4_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, return (*pipefd < 0); } -static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi) -{ - struct autofs_info *ino; - - ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755); - if (!ino) - return NULL; - - return ino; -} - -static const struct dentry_operations autofs4_sb_dentry_operations = { - .d_release = autofs4_dentry_release, -}; - int autofs4_fill_super(struct super_block *s, void *data, int silent) { struct inode * root_inode; @@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs4_sops; + s->s_d_op = &autofs4_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ - ino = autofs4_mkroot(sbi); + ino = autofs4_new_ino(sbi); if (!ino) goto fail_free; - root_inode = autofs4_get_inode(s, ino); + root_inode = autofs4_get_inode(s, S_IFDIR | 0755); if (!root_inode) goto fail_ino; @@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_iput; pipe = NULL; - d_set_d_op(root, &autofs4_sb_dentry_operations); root->d_fsdata = ino; /* Can this call block? */ @@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = autofs_type_trigger(sbi->type) ? - &autofs4_direct_root_inode_operations : - &autofs4_indirect_root_inode_operations; + root_inode->i_op = &autofs4_dir_inode_operations; /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || @@ -383,16 +326,14 @@ fail_unlock: return -EINVAL; } -struct inode *autofs4_get_inode(struct super_block *sb, - struct autofs_info *inf) +struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode) { struct inode *inode = new_inode(sb); if (inode == NULL) return NULL; - inf->inode = inode; - inode->i_mode = inf->mode; + inode->i_mode = mode; if (sb->s_root) { inode->i_uid = sb->s_root->d_inode->i_uid; inode->i_gid = sb->s_root->d_inode->i_gid; @@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); - if (S_ISDIR(inf->mode)) { + if (S_ISDIR(mode)) { inode->i_nlink = 2; inode->i_op = &autofs4_dir_inode_operations; inode->i_fop = &autofs4_dir_operations; - } else if (S_ISLNK(inf->mode)) { - inode->i_size = inf->size; + } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 651e4ef..014e7ab 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); -static void *autofs4_follow_link(struct dentry *, struct nameidata *); - -#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) -#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) +static struct vfsmount *autofs4_d_automount(struct path *); +static int autofs4_d_manage(struct dentry *, bool, bool); +static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = { .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_indirect_root_inode_operations = { +const struct inode_operations autofs4_dir_inode_operations = { .lookup = autofs4_lookup, .unlink = autofs4_dir_unlink, .symlink = autofs4_dir_symlink, @@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -const struct inode_operations autofs4_direct_root_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, - .follow_link = autofs4_follow_link, -}; - -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct dentry_operations autofs4_dentry_operations = { + .d_automount = autofs4_d_automount, + .d_manage = autofs4_d_manage, + .d_release = autofs4_dentry_release, }; static void autofs4_add_active(struct dentry *dentry) @@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry) return; } -static unsigned int autofs4_need_mount(unsigned int flags) -{ - unsigned int res = 0; - if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS)) - res = 1; - return res; -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; @@ -158,278 +139,27 @@ out: return dcache_dir_open(inode, file); } -static int try_to_fill_dentry(struct dentry *dentry, int flags) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status; - - DPRINTK("dentry=%p %.*s ino=%p", - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - - /* - * Wait for a pending mount, triggering one if there - * isn't one already - */ - if (dentry->d_inode == NULL) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - /* Turn this into a real negative dentry? */ - if (status == -ENOENT) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } else if (status) { - /* Return a negative dentry, but leave it "pending" */ - return status; - } - /* Trigger mount for path component or follow link */ - } else if (ino->flags & AUTOFS_INF_PENDING || - autofs4_need_mount(flags)) { - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); - - DPRINTK("mount done status=%d", status); - - if (status) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - return status; - } - } - - /* Initialize expiry counter after successful mount */ - ino->last_used = jiffies; - - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - - return 0; -} - -/* For autofs direct mounts the follow link triggers the mount */ -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); - int oz_mode = autofs4_oz_mode(sbi); - unsigned int lookup_type; - int status; - - DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", - dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, - nd->flags); - /* - * For an expire of a covered direct or offset mount we need - * to break out of follow_down() at the autofs mount trigger - * (d_mounted--), so we can see the expiring flag, and manage - * the blocking and following here until the expire is completed. - */ - if (oz_mode) { - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - /* Follow down to our covering mount. */ - if (!follow_down(&nd->path)) - goto done; - goto follow; - } - spin_unlock(&sbi->fs_lock); - goto done; - } - - /* If an expire request is pending everyone must wait. */ - autofs4_expire_wait(dentry); - - /* We trigger a mount for almost all flags */ - lookup_type = autofs4_need_mount(nd->flags); - spin_lock(&sbi->fs_lock); - spin_lock(&autofs4_lock); - spin_lock(&dentry->d_lock); - if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) { - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); - goto follow; - } - - /* - * If the dentry contains directories then it is an autofs - * multi-mount with no root mount offset. So don't try to - * mount it again. - */ - if (ino->flags & AUTOFS_INF_PENDING || - (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); - - status = try_to_fill_dentry(dentry, nd->flags); - if (status) - goto out_error; - - goto follow; - } - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - spin_unlock(&sbi->fs_lock); -follow: - /* - * If there is no root mount it must be an autofs - * multi-mount with no root offset so we don't need - * to follow it. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path)) { - status = -ENOENT; - goto out_error; - } - } - -done: - return NULL; - -out_error: - path_put(&nd->path); - return ERR_PTR(status); -} - -/* - * Revalidate is called on every cache lookup. Some of those - * cache lookups may actually happen while the dentry is not - * yet completely filled in, and revalidate has to delay such - * lookups.. - */ -static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) +static void autofs4_dentry_release(struct dentry *de) { - struct inode *dir; - struct autofs_sb_info *sbi; - int oz_mode; - int flags = nd ? nd->flags : 0; - int status = 1; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dentry->d_parent->d_inode; - sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); - - /* Pending dentry */ - spin_lock(&sbi->fs_lock); - if (autofs4_ispending(dentry)) { - /* The daemon never causes a mount to trigger */ - spin_unlock(&sbi->fs_lock); - - if (oz_mode) - return 1; - - /* - * If the directory has gone away due to an expire - * we have been called as ->d_revalidate() and so - * we need to return false and proceed to ->lookup(). - */ - if (autofs4_expire_wait(dentry) == -EAGAIN) - return 0; - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&sbi->fs_lock); - - /* Negative dentry.. invalidate if "old" */ - if (dentry->d_inode == NULL) - return 0; - - /* Check for a non-mountpoint directory with no contents */ - spin_lock(&autofs4_lock); - spin_lock(&dentry->d_lock); - if (S_ISDIR(dentry->d_inode->i_mode) && - !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - DPRINTK("dentry=%p %.*s, emptydir", - dentry, dentry->d_name.len, dentry->d_name.name); - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - - /* The daemon never causes a mount to trigger */ - if (oz_mode) - return 1; - - /* - * A zero status is success otherwise we have a - * negative error code. - */ - status = try_to_fill_dentry(dentry, flags); - if (status == 0) - return 1; - - return status; - } - spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); - - return 1; -} - -void autofs4_dentry_release(struct dentry *de) -{ - struct autofs_info *inf; + struct autofs_info *ino = autofs4_dentry_ino(de); + struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); DPRINTK("releasing %p", de); - inf = autofs4_dentry_ino(de); - de->d_fsdata = NULL; - - if (inf) { - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - - if (sbi) { - spin_lock(&sbi->lookup_lock); - if (!list_empty(&inf->active)) - list_del(&inf->active); - if (!list_empty(&inf->expiring)) - list_del(&inf->expiring); - spin_unlock(&sbi->lookup_lock); - } - - inf->dentry = NULL; - inf->inode = NULL; + if (!ino) + return; - autofs4_free_ino(inf); + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); } -} -/* For dentries of directories in the root dir */ -static const struct dentry_operations autofs4_root_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; - -/* For other dentries */ -static const struct dentry_operations autofs4_dentry_operations = { - .d_revalidate = autofs4_revalidate, - .d_release = autofs4_dentry_release, -}; + autofs4_free_ino(ino); +} static struct dentry *autofs4_lookup_active(struct dentry *dentry) { @@ -541,51 +271,246 @@ next: return NULL; } +static int autofs4_mount_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + if (ino->flags & AUTOFS_INF_PENDING) { + DPRINTK("waiting for mount name=%.*s", + dentry->d_name.len, dentry->d_name.name); + status = autofs4_wait(sbi, dentry, NFY_MOUNT); + DPRINTK("mount wait done status=%d", status); + ino->last_used = jiffies; + return status; + } + return 0; +} + +static int do_expire_wait(struct dentry *dentry) +{ + struct dentry *expiring; + + expiring = autofs4_lookup_expiring(dentry); + if (!expiring) + return autofs4_expire_wait(dentry); + else { + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs4_expire_wait(expiring); + autofs4_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs4_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. + */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct dentry *new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs4_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* + * Someone may have manually umounted this or it was a submount + * that has gone away. + */ + spin_lock(&dentry->d_lock); + if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + __managed_dentry_set_transit(path->dentry); + } + spin_unlock(&dentry->d_lock); + + /* The daemon never triggers a mount. */ + if (autofs4_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(dentry); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having d_mountpoint() true, so there's no need to call back + * to the daemon. + */ + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) + goto done; + if (!d_mountpoint(dentry)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 have_submounts() is sufficient to handle + * this because the leaves of the directory tree under the + * mount never trigger mounts themselves (they have an autofs + * trigger mount mounted on them). But v4 pseudo direct mounts + * do need the leaves to to trigger mounts. In this case we + * have no choice but to use the list_empty() check and + * require user space behave. + */ + if (sbi->version > 4) { + if (have_submounts(dentry)) + goto done; + } else { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&dentry->d_lock); + goto done; + } + spin_unlock(&dentry->d_lock); + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs4_mount_wait(dentry); + if (status) + return ERR_PTR(status); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + } +done: + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path updated + * so turn this into a normal dentry so we don't continually + * call ->d_automount() and ->d_manage(). + */ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_transit(dentry); + /* + * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and + * symlinks as in all other cases the dentry will be covered by + * an actual mount so ->d_automount() won't be called during + * the follow. + */ + if ((!d_mountpoint(dentry) && + !list_empty(&dentry->d_subdirs)) || + (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + __managed_dentry_clear_automount(dentry); + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->fs_lock); + + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs4_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + + DPRINTK("dentry=%p %.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + /* The daemon never waits. */ + if (autofs4_oz_mode(sbi) || mounting_here) { + if (!d_mountpoint(dentry)) + return -EISDIR; + return 0; + } + + /* We need to sleep, so we need pathwalk to be in ref-mode */ + if (rcu_walk) + return -ECHILD; + + /* Wait for pending expires */ + do_expire_wait(dentry); + + /* + * This dentry may be under construction so wait on mount + * completion. + */ + return autofs4_mount_wait(dentry); +} + /* Lookups in the root directory */ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; struct autofs_info *ino; - struct dentry *expiring, *active; - int oz_mode; + struct dentry *active; - DPRINTK("name = %.*s", - dentry->d_name.len, dentry->d_name.name); + DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); sbi = autofs4_sbi(dir->i_sb); - oz_mode = autofs4_oz_mode(sbi); DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", - current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); if (active) { - dentry = active; - ino = autofs4_dentry_ino(dentry); + return active; } else { /* - * Mark the dentry incomplete but don't hash it. We do this - * to serialize our inode creation operations (symlink and - * mkdir) which prevents deadlock during the callback to - * the daemon. Subsequent user space lookups for the same - * dentry are placed on the wait queue while the daemon - * itself is allowed passage unresticted so the create - * operation itself can then hash the dentry. Finally, - * we check for the hashed dentry and return the newly - * hashed dentry. + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return fail immediately. The daemon however does need + * to create directories within the file system. */ - d_set_d_op(dentry, &autofs4_root_dentry_operations); + if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); - /* - * And we need to ensure that the same dentry is used for - * all following lookup calls until it is hashed so that - * the dentry flags are persistent throughout the request. - */ - ino = autofs4_init_ino(NULL, sbi, 0555); + /* Mark entries in the root as mount triggers */ + if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + __managed_dentry_set_managed(dentry); + + ino = autofs4_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); @@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s d_instantiate(dentry, NULL); } - - if (!oz_mode) { - mutex_unlock(&dir->i_mutex); - expiring = autofs4_lookup_expiring(dentry); - if (expiring) { - /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - */ - autofs4_expire_wait(expiring); - autofs4_del_expiring(expiring); - dput(expiring); - } - - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - if (dentry->d_op && dentry->d_op->d_revalidate) - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); - } - - /* - * If we are still pending, check if we had to handle - * a signal. If so we can force a restart.. - */ - if (ino->flags & AUTOFS_INF_PENDING) { - /* See if we were interrupted */ - if (signal_pending(current)) { - sigset_t *sigset = ¤t->pending.signal; - if (sigismember (sigset, SIGKILL) || - sigismember (sigset, SIGQUIT) || - sigismember (sigset, SIGINT)) { - if (active) - dput(active); - return ERR_PTR(-ERESTARTNOINTR); - } - } - if (!oz_mode) { - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - } - } - - /* - * If this dentry is unhashed, then we shouldn't honour this - * lookup. Returning ENOENT here doesn't do the right thing - * for all system calls, but it should be OK for the operations - * we permit from an autofs. - */ - if (!oz_mode && d_unhashed(dentry)) { - /* - * A user space application can (and has done in the past) - * remove and re-create this directory during the callback. - * This can leave us with an unhashed dentry, but a - * successful mount! So we need to perform another - * cached lookup in case the dentry now exists. - */ - struct dentry *parent = dentry->d_parent; - struct dentry *new = d_lookup(parent, &dentry->d_name); - if (new != NULL) - dentry = new; - else - dentry = ERR_PTR(-ENOENT); - - if (active) - dput(active); - - return dentry; - } - - if (active) - return active; - return NULL; } @@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir, struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; + size_t size = strlen(symname); char *cp; DPRINTK("%s <- %.*s", symname, @@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir, if (!autofs4_oz_mode(sbi)) return -EACCES; - ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - ino->size = strlen(symname); - cp = kmalloc(ino->size + 1, GFP_KERNEL); - if (!cp) { - if (!dentry->d_fsdata) - kfree(ino); + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) return -ENOMEM; - } strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, ino); + inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); if (!dentry->d_fsdata) kfree(ino); return -ENOMEM; } + inode->i_private = cp; + inode->i_size = size; d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - d_set_d_op(dentry, &autofs4_root_dentry_operations); - else - d_set_d_op(dentry, &autofs4_dentry_operations); - - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; - ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) return 0; } +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is neccessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks. + */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentrys in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); + return; +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentrys in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentrys in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_u.d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); + return; +} + static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); @@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); spin_unlock(&autofs4_lock); + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) @@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) DPRINTK("dentry %p, creating %.*s", dentry, dentry->d_name.len, dentry->d_name.name); - ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (!ino) - return -ENOMEM; + BUG_ON(!ino); + + autofs4_clean_ino(ino); autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, ino); - if (!inode) { - if (!dentry->d_fsdata) - kfree(ino); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + if (!inode) return -ENOMEM; - } d_add(dentry, inode); - if (dir == dir->i_sb->s_root->d_inode) - d_set_d_op(dentry, &autofs4_root_dentry_operations); - else - d_set_d_op(dentry, &autofs4_dentry_operations); + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); - dentry->d_fsdata = ino; - ino->dentry = dget(dentry); + dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_inc(&p_ino->count); - ino->inode = inode; inc_nlink(dir); dir->i_mtime = CURRENT_TIME; @@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) int is_autofs4_dentry(struct dentry *dentry) { return dentry && dentry->d_inode && - (dentry->d_op == &autofs4_root_dentry_operations || - dentry->d_op == &autofs4_dentry_operations) && + dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index b4ea829..f27c094 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,8 +14,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); - nd_set_link(nd, (char *)ino->u.symlink); + nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index c5f8459..5601005 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait, * completed while we waited on the mutex ... */ if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + int valid = 1; + /* * If the dentry was successfully mounted while we slept * on the wait queue mutex we can return success. If it @@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait, * a multi-mount with no mount at it's base) we can * continue on and create a new request. */ + if (!IS_ROOT(dentry)) { + if (dentry->d_inode && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } if (have_submounts(dentry)) - return 0; + valid = 0; + + if (new) + dput(new); + return valid; } return 1; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c02..ecb9fd3 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,8 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36..31610ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ae2c8c..15b5ca2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) + if (IS_ERR(acl)) { + kfree(value); return acl; + } set_cached_acl(inode, type, acl); } kfree(value); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6ad63f1..ccc991c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,7 +157,7 @@ struct btrfs_inode { /* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress:4; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b50bc4b..f745287 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -62,6 +62,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / @@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; } + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. + */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -1; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void __exit btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*page_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*page_index)++; + if (*page_index >= vcnt) + return 0; + + page_out = bvec[*page_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4..5100017 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,27 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9ac1715..b5baff0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { + if (!p) + return; btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } @@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); btrfs_set_lock_blocking(left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a142d20..2c98b3a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/kobject.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -398,13 +407,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* * A leaf is full of items. offset and size tell us where to find @@ -551,9 +562,11 @@ struct btrfs_timespec { } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -597,6 +610,8 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -895,7 +910,8 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; u64 open_ioctl_trans; - unsigned long mount_opt; + unsigned long mount_opt:20; + unsigned long compress_type:4; u64 max_inline; u64 alloc_start; struct btrfs_transaction *running_transaction; @@ -1050,6 +1066,9 @@ struct btrfs_fs_info { unsigned metadata_ratio; void *bdev_holder; + + /* filesystem state */ + u64 fs_state; }; /* @@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; +} + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); @@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51d2e4d..b531c36 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,20 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + WARN_ON(1); + goto out; + } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); @@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + ret = -EIO; + goto out; + } found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } btrfs_free_path(path); if (ret) { + kfree(root); if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); @@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) + if (!bh) { + err = -EINVAL; goto fail_iput; + } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); memcpy(&fs_info->super_for_commit, &fs_info->super_copy, @@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_iput; + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); - } + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; @@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root) smp_mb(); btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannnot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. + */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } @@ -2619,6 +2671,352 @@ out: return 0; } +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + spin_lock(&root->fs_info->delalloc_lock); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if (PageWriteback(page)) + end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + ret = btrfs_error_discard_extent(root, start, end + 1 - start); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->trans_mutex); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + list_splice_init(&root->fs_info->trans_list, &list); + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + t->use_count = 0; + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 88e825a..07b20dc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 227e581..b552693 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) return btrfs_reduce_alloc_profile(root, flags); } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; @@ -3161,8 +3161,12 @@ alloc: bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret < 0) - return ret; + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } if (!data_sinfo) { btrfs_set_inode_space_info(root, inode); @@ -3173,6 +3177,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ +commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); @@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return -ENOSPC; } @@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } + spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -8012,6 +8013,62 @@ out: return ret; } +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. + */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; @@ -8584,3 +8641,14 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e86b9f..2e993cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); @@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; spin_lock_init(&eb->lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4183c81..7083cfa 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,8 +20,12 @@ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 @@ -135,6 +139,17 @@ struct extent_buffer { wait_queue_head_t lock_wq; }; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + struct extent_map_tree; static inline struct extent_state *extent_state_next(struct extent_state *state) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23cb8da..b0e1fce 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask) return em; em->in_tree = 0; em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; atomic_set(&em->refs, 1); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b..28b44db 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,7 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int in_tree:1; + unsigned int compress_type:4; }; struct extent_map_tree { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d8..c800d58 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> @@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); free_extent_map(split); @@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; if (compressed) { split->block_len = em->block_len; @@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err) goto out; + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + err = -EROFS; + goto out; + } + file_update_time(file); BTRFS_I(inode)->sequence++; @@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3798a3..160b55b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; size_t datasize; unsigned long offset; - int use_compress = 0; + int compress_type = BTRFS_COMPRESS_NONE; if (compressed_size && compressed_pages) { - use_compress = 1; + compress_type = root->fs_info->compress_type; cur_size = compressed_size; } @@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); @@ -263,6 +263,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; @@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow, async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; actual_end = min_t(u64, isize, end + 1); again: @@ -381,12 +385,16 @@ again: WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -493,7 +501,8 @@ again: * and will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); if (start + num_bytes < end) { start += num_bytes; @@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } @@ -640,6 +650,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -656,11 +667,13 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); BUG_ON(ret); /* @@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; + int compress_type = 0; int ret; bool nolock = false; @@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + @@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - compressed, 0, 0, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, @@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); } failrec->logical = logical; free_extent_map(em); @@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; @@ -4928,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); @@ -4939,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); if (ret) { char *kaddr = kmap_atomic(page, KM_USER0); unsigned long copy_size = min_t(u64, @@ -4982,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + int compress_type; again: read_lock(&em_tree->lock); @@ -5041,7 +5062,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); + compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -5087,8 +5108,9 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - if (compressed) { + if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; em->block_start = bytenr; em->block_len = btrfs_file_extent_disk_num_bytes(leaf, item); @@ -5122,12 +5144,14 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_start = EXTENT_MAP_INLINE; - if (compressed) + if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); @@ -6477,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; - ei->force_compress = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree, GFP_NOFS); @@ -7098,116 +7122,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode && (mode != FALLOC_FL_KEEP_SIZE)) - return -EOPNOTSUPP; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7215,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, flags, btrfs_check_acl); @@ -7310,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a..a506a22 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int flags, oldflags; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -360,7 +363,8 @@ fail: } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid) + char *name, int namelen, u64 *async_transid, + bool readonly) { struct inode *inode; struct dentry *parent; @@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; pending_snapshot->root = root; + pending_snapshot->readonly = readonly; trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid) + u64 *async_transid, bool readonly) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (snap_src) { error = create_snapshot(snap_src, dentry, - name, namelen, async_transid); + name, namelen, async_transid, readonly); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen, async_transid); @@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct page *page; + struct btrfs_super_block *disk_super; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; unsigned long total_read = 0; + u64 features; u64 page_start; u64 page_end; u64 last_len = 0; @@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file, u64 defrag_end = 0; unsigned long i; int ret; + int compress_type = BTRFS_COMPRESS_ZLIB; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } if (inode->i_size == 0) return 0; @@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file, total_read++; mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + BTRFS_I(inode)->force_compress = compress_type; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -781,10 +796,17 @@ loop_unlock: atomic_dec(&root->fs_info->async_submit_draining); mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; mutex_unlock(&inode->i_mutex); } + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + return 0; err_reservations: @@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, - u64 *transid) + u64 *transid, + bool readonly) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid); + NULL, transid, readonly); } else { struct inode *src_inode; src_file = fget(fd); @@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid); + transid, readonly); fput(src_file); } out: @@ -946,58 +969,139 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol, - int v2) + void __user *arg, int subvol) { - struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; - char *name; - u64 fd; + struct btrfs_ioctl_vol_args *vol_args; int ret; - if (v2) { - u64 transid = 0; - u64 *ptr = NULL; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); - if (IS_ERR(vol_args_v2)) - return PTR_ERR(vol_args_v2); + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); - if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out; - } - - name = vol_args_v2->name; - fd = vol_args_v2->fd; - vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + kfree(vol_args); + return ret; +} - if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, ptr); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; - } else { - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - name = vol_args->name; - fd = vol_args->fd; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, NULL); + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); - kfree(vol_args_v2); + return ret; +} +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); return ret; } @@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + ret = mnt_want_write(file->f_path.mnt); if (ret) goto out; @@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create(file, argp, 0, 1); + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1, 0); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c344d12..8fb3821 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args { }; #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { @@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args { */ __u32 extent_thresh; + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + /* spare for later */ - __u32 unused[5]; + __u32 unused[4]; }; struct btrfs_ioctl_space_info { @@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 0000000..cc9b450 --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,420 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + ret = -1; + data_in = NULL; + goto done; + } + data_in = kmap(pages_in[page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + if (data_in) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7737e..2b61e1d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio) + int type, int dio, int compress_type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = inode; + entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0); + disk_len, type, 0, + BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1); + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 61dca83..ff1f69a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -68,7 +68,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ @@ -93,6 +93,9 @@ struct btrfs_ordered_extent { /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 22acdaa..b2130c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,90 @@ static const struct super_operations btrfs_super_ops; +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + static void btrfs_put_super(struct super_block *sb) { struct btrfs_root *root = btrfs_sb(sb); @@ -69,9 +153,9 @@ enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, - Opt_user_subvol_rm_allowed, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, }; static match_table_t tokens = { @@ -86,7 +170,9 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *p, *num, *orig; int intarg; int ret = 0; + char *compress_type; + bool compress_force = false; if (!options) return 0; @@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; - case Opt_compress: - printk(KERN_INFO "btrfs: use compression\n"); - btrfs_set_opt(info->mount_opt, COMPRESS); - break; case Opt_compress_force: - printk(KERN_INFO "btrfs: forcing compression\n"); - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); @@ -753,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -760,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 total_used_data = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_SYSTEM)) - total_used_data += found->disk_total; - else - total_used_data += found->disk_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + total_used += found->disk_used; } rcu_read_unlock(); @@ -778,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -897,10 +1136,14 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_cachep(); + err = btrfs_init_compress(); if (err) goto free_sysfs; + err = btrfs_init_cachep(); + if (err) + goto free_compress; + err = extent_io_init(); if (err) goto free_cachep; @@ -928,6 +1171,8 @@ free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); free_sysfs: btrfs_exit_sysfs(); return err; @@ -942,7 +1187,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - btrfs_zlib_exit(); + btrfs_exit_compress(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f50e931..bae5c7b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 to_reserve = 0; u64 index = 0; u64 objectid; + u64 root_flags; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { @@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_set_lock_blocking(old); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f104b57..229a594 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,6 +62,7 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */ int error; + bool readonly; struct list_head list; }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1718e1a..d158530 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -600,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -703,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -729,59 +732,167 @@ error: return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -790,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -815,48 +911,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. */ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -1196,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1916,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; @@ -2154,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2230,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ - if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we stored + * max free space information of every device. It is used when we can not + * allocate chunks by default size. + * + * By this helper, we can allocate a new chunk as larger as possible. + */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); + + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset = 0, it means the free space of this device + * is less than what we need, and we didn't search max avail + * extent on this device, so do it now. + */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we update the max avail free extent of each devices, sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); + + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} - do_div(calc_size, stripe_len); - calc_size *= stripe_len; +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if (!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2287,27 +2559,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, device, min_free, + &devices_info[i].dev_offset, + &devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2316,34 +2600,36 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2376,7 +2662,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1be7810..7fb59d4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,11 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -136,6 +139,30 @@ struct btrfs_multi_bio { struct btrfs_bio_stripe stripes[]; }; +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; +}; + +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 698fdd2..a577653 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b9cd544..f5ec2d4 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? -*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,152 +39,51 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. - */ -static struct workspace *find_zlib_workspace(void) + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } + if (!workspace) + return ERR_PTR(-ENOMEM); workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; - -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; -} + INIT_LIST_HEAD(&workspace->list); -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; @@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "deflateInit failed\n"); ret = -1; @@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ -314,55 +208,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) { - int ret = 0; + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; int wbits = MAX_WBITS; - struct workspace *workspace; char *data_in; size_t total_out = 0; - unsigned long page_bytes_left; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - struct page *page_out; unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long working_bytes; unsigned long pg_offset; - unsigned long start_byte; - unsigned long current_buf_start; - char *kaddr; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; data_in = kmap(pages_in[page_in_index]); workspace->inf_strm.next_in = data_in; @@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, workspace->inf_strm.total_out = 0; workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - page_out = bvec[page_out_index].bv_page; - page_bytes_left = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; - /* - * buf start is the byte offset we're of the start of - * our workspace buffer - */ - buf_start = total_out; - /* total_out is the last byte of the workspace buffer */ + buf_start = total_out; total_out = workspace->inf_strm.total_out; - working_bytes = total_out - buf_start; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. - */ - start_byte = page_offset(page_out) - disk_start; - - if (working_bytes == 0) { - /* we didn't make progress in this inflate - * call, we're done - */ - if (ret != Z_STREAM_END) - ret = -1; + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) break; - } - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - goto next; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, - bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page_out); - - pg_offset += bytes; - page_bytes_left -= bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (page_bytes_left == 0) { - page_out_index++; - if (page_out_index >= vcnt) { - ret = 0; - goto done; - } - - page_out = bvec[page_out_index].bv_page; - pg_offset = 0; - page_bytes_left = PAGE_CACHE_SIZE; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - goto next; - - /* the next page in the biovec might not - * be adjacent to the last page, but it - * might still be found inside this working - * buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + - buf_offset; - } - } + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; } -next: + workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; @@ -516,35 +301,21 @@ done: zlib_inflateEnd(&workspace->inf_strm); if (data_in) kunmap(pages_in[page_in_index]); -out: - free_workspace(workspace); return ret; } -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - struct workspace *workspace; unsigned long bytes_left = destlen; unsigned long total_out = 0; char *kaddr; - if (destlen > PAGE_CACHE_SIZE) - return -ENOMEM; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; - workspace->inf_strm.next_in = data_in; workspace->inf_strm.avail_in = srclen; workspace->inf_strm.total_in = 0; @@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (bytes_left > 0) { @@ -616,12 +386,13 @@ next: ret = 0; zlib_inflateEnd(&workspace->inf_strm); -out: - free_workspace(workspace); return ret; } -void btrfs_zlib_exit(void) -{ - free_workspaces(); -} +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c68a056..7ed3653 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, } -static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, - struct list_head *mntlist) -{ - /* stolen from afs code */ - int err; - - mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); - switch (err) { - case 0: - path_put(&nd->path); - nd->path.mnt = newmnt; - nd->path.dentry = dget(newmnt->mnt_root); - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); - break; - case -EBUSY: - /* someone else made a mount here whilst we were busy */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - default: - mntput(newmnt); - break; - } - return err; -} - static void dump_referral(const struct dfs_info3_param *ref) { cFYI(1, "DFS: ref path: %s", ref->path_name); @@ -293,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref) ref->path_consumed); } - -static void* -cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +/* + * Create a vfsmount that we can automount + */ +static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { struct dfs_info3_param *referrals = NULL; unsigned int num_referrals = 0; struct cifs_sb_info *cifs_sb; struct cifsSesInfo *ses; - char *full_path = NULL; + char *full_path; int xid, i; - int rc = 0; - struct vfsmount *mnt = ERR_PTR(-ENOENT); + int rc; + struct vfsmount *mnt; struct tcon_link *tlink; cFYI(1, "in %s", __func__); - BUG_ON(IS_ROOT(dentry)); + BUG_ON(IS_ROOT(mntpt)); xid = GetXid(); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* * The MSDFS spec states that paths in DFS referral requests and * responses must be prefixed by a single '\' character instead of * the double backslashes usually used in the UNC. This function * gives us the latter, so we must adjust the result. */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; - goto out_err; - } + mnt = ERR_PTR(-ENOMEM); + full_path = build_path_from_dentry(mntpt); + if (full_path == NULL) + goto free_xid; - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); + cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); + mnt = ERR_PTR(-EINVAL); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - goto out_err; + mnt = ERR_CAST(tlink); + goto free_full_path; } ses = tlink_tcon(tlink)->ses; @@ -341,46 +310,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) cifs_put_tlink(tlink); + mnt = ERR_PTR(-ENOENT); for (i = 0; i < num_referrals; i++) { int len; - dump_referral(referrals+i); + dump_referral(referrals + i); /* connect to a node */ len = strlen(referrals[i].node_name); if (len < 2) { cERROR(1, "%s: Net Address path too short: %s", __func__, referrals[i].node_name); - rc = -EINVAL; - goto out_err; + mnt = ERR_PTR(-EINVAL); + break; } mnt = cifs_dfs_do_refmount(cifs_sb, full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); - - /* complete mount procedure if we accured submount */ if (!IS_ERR(mnt)) - break; + goto success; } - /* we need it cause for() above could exit without valid submount */ - rc = PTR_ERR(mnt); - if (IS_ERR(mnt)) - goto out_err; - - rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); + /* no valid submounts were found; return error from get_dfs_path() by + * preference */ + if (rc != 0) + mnt = ERR_PTR(rc); -out: - FreeXid(xid); +success: free_dfs_info_array(referrals, num_referrals); +free_full_path: kfree(full_path); +free_xid: + FreeXid(xid); cFYI(1, "leaving %s" , __func__); - return ERR_PTR(rc); -out_err: - path_put(&nd->path); - goto out; + return mnt; +} + +/* + * Attempt to automount the referral + */ +struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + struct vfsmount *newmnt; + + cFYI(1, "in %s", __func__); + + newmnt = cifs_dfs_do_automount(path->dentry); + if (IS_ERR(newmnt)) { + cFYI(1, "leaving %s [automount failed]" , __func__); + return newmnt; + } + + mntget(newmnt); /* prevent immediate expiration */ + mnt_set_expiry(newmnt, &cifs_dfs_automount_list); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); + cFYI(1, "leaving %s [ok]" , __func__); + return newmnt; } const struct inode_operations cifs_dfs_referral_inode_operations = { - .follow_link = cifs_dfs_follow_mountpoint, }; - diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 897b2b2..851030f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -93,6 +93,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); extern const struct dentry_operations cifs_dentry_ops; extern const struct dentry_operations cifs_ci_dentry_ops; +#ifdef CONFIG_CIFS_DFS_UPCALL +extern struct vfsmount *cifs_dfs_d_automount(struct path *path); +#else +#define cifs_dfs_d_automount NULL +#endif + /* Functions related to symlinks */ extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); extern void cifs_put_link(struct dentry *direntry, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a65d311..9f59887 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1113,6 +1113,8 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (!strnicmp(data, "uid", 3) && value && *value) { vol->linux_uid = simple_strtoul(value, &value, 0); uid_specified = true; + } else if (!strnicmp(data, "cruid", 5) && value && *value) { + vol->cred_uid = simple_strtoul(value, &value, 0); } else if (!strnicmp(data, "forceuid", 8)) { override_uid = 1; } else if (!strnicmp(data, "noforceuid", 10)) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1e95dd6..dd5f229 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -675,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) const struct dentry_operations cifs_dentry_ops = { .d_revalidate = cifs_d_revalidate, + .d_automount = cifs_dfs_d_automount, /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; @@ -711,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = { .d_revalidate = cifs_d_revalidate, .d_hash = cifs_ci_hash, .d_compare = cifs_ci_compare, + .d_automount = cifs_dfs_d_automount, }; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b06b606..6c9ee80 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -32,7 +32,7 @@ #include "fscache.h" -static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) +static void cifs_set_ops(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -60,7 +60,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) break; case S_IFDIR: #ifdef CONFIG_CIFS_DFS_UPCALL - if (is_dfs_referral) { + if (IS_AUTOMOUNT(inode)) { inode->i_op = &cifs_dfs_referral_inode_operations; } else { #else /* NO DFS support, treat as a directory */ @@ -167,7 +167,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } spin_unlock(&inode->i_lock); - cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); + if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + inode->i_flags |= S_AUTOMOUNT; + cifs_set_ops(inode); } void diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 9aad47a..6783ce6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr) } /* else ERRHRD class errors or junk - return EIO */ - cFYI(1, "Mapping smb error code %d to POSIX err %d", - smberrcode, rc); + cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. reconnect SMB session on * ERRbaduid could be added */ diff --git a/fs/compat.c b/fs/compat.c index eb1740a..f6fd0a0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * } /* - * The following statfs calls are copies of code from fs/open.c and + * The following statfs calls are copies of code from fs/statfs.c and * should be checked against those from time to time */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) @@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat __put_user(kbuf->f_namelen, &ubuf->f_namelen) || __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize)) + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) return -EFAULT; return 0; } @@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type, if (nr_segs > fast_segs) { ret = -ENOMEM; iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) { - *ret_pointer = fast_pointer; + if (iov == NULL) goto out; - } } *ret_pointer = iov; diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 13587cc..9febcde 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,8 +1,8 @@ config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - depends on SYSFS + select SYSFS help - configfs is a ram-based filesystem that provides the converse + configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based view of kernel objects, configfs is a filesystem-based manager of kernel objects, or config_items. diff --git a/fs/dcache.c b/fs/dcache.c index 274a222..9f493ee 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1380,8 +1380,11 @@ EXPORT_SYMBOL(d_set_d_op); static void __d_instantiate(struct dentry *dentry, struct inode *inode) { spin_lock(&dentry->d_lock); - if (inode) + if (inode) { + if (unlikely(IS_AUTOMOUNT(inode))) + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; list_add(&dentry->d_alias, &inode->i_dentry); + } dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 2dbb422..1897eb1 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -1,8 +1,7 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on EXPERIMENTAL && INET - depends on SYSFS && (IPV6 || IPV6=n) - select CONFIGFS_FS + depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP help A general purpose distributed lock manager for kernel or userspace diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index cbadc1b..bfd8b68 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, BUG_ON(!crypt_stat || !crypt_stat->tfm || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", + ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - "derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if (unlikely(ecryptfs_verbosity > 0)) { @@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "encryption:\n"); ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); @@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page, rc = ecryptfs_derive_iv(extent_iv, crypt_stat, (extent_base + extent_offset)); if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to " - "derive IV for extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " + "extent [0x%.16llx]; rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); goto out; } if (unlikely(ecryptfs_verbosity > 0)) { @@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page, } rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; " - "rc = [%d]\n", (extent_base + extent_offset), - rc); + ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " + "rc = [%d]\n", + (unsigned long long)(extent_base + extent_offset), rc); ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " "decryption:\n"); ecryptfs_dump_hex((char *)(page_address(page) @@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) } ecryptfs_printk(KERN_DEBUG, "Initializing cipher [%s]; strlen = [%d]; " - "key_size_bits = [%d]\n", + "key_size_bits = [%zd]\n", crypt_stat->cipher, (int)strlen(crypt_stat->cipher), crypt_stat->key_size << 3); if (crypt_stat->tfm) { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 413a3c4..dbc84ed 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key) (((struct user_key_payload*)key->payload.data)->data); } -#define ECRYPTFS_SUPER_MAGIC 0xf15f #define ECRYPTFS_MAX_KEYSET_SIZE 1024 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 @@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); +__attribute__ ((format(printf, 1, 2))) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 91da029..81e10e6 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - int rc; + ssize_t rc; struct dentry *lower_dentry; struct vfsmount *lower_vfsmount; struct file *file = iocb->ki_filp; @@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if (!ecryptfs_inode_to_private(inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free; } - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) + == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) { rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); @@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } } mutex_unlock(&crypt_stat->cs_mutex); - ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " - "size: [0x%.16x]\n", inode, inode->i_ino, - i_size_read(inode)); + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " + "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + (unsigned long long)i_size_read(inode)); goto out; out_free: kmem_cache_free(ecryptfs_file_info_cache, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 64ff023..bd33f87 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { @@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = -ENOMEM; goto out; } - if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(ecryptfs_dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize " - "the persistent file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); - goto out_free_kmem; - } + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } crypt_stat = &ecryptfs_inode_to_private( ecryptfs_dentry->d_inode)->crypt_stat; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index b1f6858..c1436cf 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -59,7 +59,7 @@ static int process_request_key_err(long err_code) break; default: ecryptfs_printk(KERN_WARNING, "Unknown error code: " - "[0x%.16x]\n", err_code); + "[0x%.16lx]\n", err_code); rc = -EINVAL; } return rc; @@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size, } else { rc = -EINVAL; ecryptfs_printk(KERN_WARNING, - "Unsupported packet size: [%d]\n", size); + "Unsupported packet size: [%zd]\n", size); } return rc; } @@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, auth_tok->session_key.decrypted_key_size); crypt_stat->flags |= ECRYPTFS_KEY_VALID; if (unlikely(ecryptfs_verbosity > 0)) { - ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n", crypt_stat->key_size); ecryptfs_dump_hex(crypt_stat->key, crypt_stat->key_size); @@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { ecryptfs_printk(KERN_ERR, "Expected " "signature of size [%d]; " - "read size [%d]\n", + "read size [%zd]\n", ECRYPTFS_SIG_SIZE, tag_11_contents_size); rc = -EIO; @@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, goto out_wipe_list; break; default: - ecryptfs_printk(KERN_DEBUG, "No packet at offset " - "[%d] of the file header; hex value of " + ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " + "of the file header; hex value of " "character is [0x%.2x]\n", i, src[i]); next_packet_is_auth_tok_packet = 0; } @@ -1864,8 +1864,8 @@ found_matching_auth_tok: "session key for authentication token with sig " "[%.*s]; rc = [%d]. Removing auth tok " "candidate from the list and searching for " - "the next match.\n", candidate_auth_tok_sig, - ECRYPTFS_SIG_SIZE_HEX, rc); + "the next match.\n", ECRYPTFS_SIG_SIZE_HEX, + candidate_auth_tok_sig, rc); list_for_each_entry_safe(auth_tok_list_item, auth_tok_list_item_tmp, &auth_tok_list, list) { @@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (encrypted_session_key_valid) { ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; " "using auth_tok->session_key.encrypted_key, " - "where key_rec->enc_key_size = [%d]\n", + "where key_rec->enc_key_size = [%zd]\n", key_rec->enc_key_size); memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, @@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, if (rc < 1 || rc > 2) { ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat session key; expected rc = 1; " - "got rc = [%d]. key_rec->enc_key_size = [%d]\n", + "got rc = [%d]. key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, ecryptfs_printk(KERN_ERR, "Error generating scatterlist " "for crypt_stat encrypted session key; " "expected rc = 1; got rc = [%d]. " - "key_rec->enc_key_size = [%d]\n", rc, + "key_rec->enc_key_size = [%zd]\n", rc, key_rec->enc_key_size); rc = -ENOMEM; goto out; @@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, goto out; } rc = 0; - ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", + ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", crypt_stat->key_size); rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, (*key_rec).enc_key_size); @@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, } ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); if (ecryptfs_verbosity > 0) { - ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n", + ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n", key_rec->enc_key_size); ecryptfs_dump_hex(key_rec->enc_key, key_rec->enc_key_size); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d3b28ab..758323a 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -36,6 +36,7 @@ #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/magic.h> #include "ecryptfs_kernel.h" /** @@ -564,6 +565,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags ecryptfs_set_superblock_lower(s, path.dentry->d_sb); s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; + s->s_magic = ECRYPTFS_SUPER_MAGIC; inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); @@ -808,9 +810,10 @@ static int __init ecryptfs_init(void) ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " - "default eCryptfs extent size is [%d] bytes; " - "the page size is [%d] bytes.\n", - ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); + "default eCryptfs extent size is [%u] bytes; " + "the page size is [%lu] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, + (unsigned long)PAGE_CACHE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index b1d8275..cc64fca 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting " - "page (upper index [0x%.16x])\n", page->index); + "page (upper index [0x%.16lx])\n", page->index); ClearPageUptodate(page); goto out; } @@ -237,7 +237,7 @@ out: ClearPageUptodate(page); else SetPageUptodate(page); - ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", page->index); unlock_page(page); return rc; @@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file, return -ENOMEM; *pagep = page; + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; @@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file, SetPageUptodate(page); } } else { - rc = ecryptfs_decrypt_page(page); - if (rc) { - printk(KERN_ERR "%s: Error decrypting page " - "at index [%ld]; rc = [%d]\n", - __func__, page->index, rc); - ClearPageUptodate(page); - goto out; + if (prev_page_end_size + >= i_size_read(page->mapping->host)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + } else { + rc = ecryptfs_decrypt_page(page); + if (rc) { + printk(KERN_ERR "%s: Error decrypting " + "page at index [%ld]; " + "rc = [%d]\n", + __func__, page->index, rc); + ClearPageUptodate(page); + goto out; + } } SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ if (index != 0) { @@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, to); @@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file, rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", index); + "zeros in page with index = [0x%.16lx]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", index); + "index [0x%.16lx])\n", index); goto out; } if (pos + copied > i_size_read(ecryptfs_inode)) { i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " - "[0x%.16x]\n", i_size_read(ecryptfs_inode)); + "[0x%.16llx]\n", + (unsigned long long)i_size_read(ecryptfs_inode)); } rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); if (rc) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1de65f5..0c8d97b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c4068f6..63a7581 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode, } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; handle_t *handle; loff_t new_size; unsigned int max_blocks; @@ -3645,7 +3646,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) unsigned int credits, blkbits = inode->i_blkbits; /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode && (mode != FALLOC_FL_KEEP_SIZE)) + if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; /* @@ -3655,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bb003dc..2e8322c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = { .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { @@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = { .removexattr = generic_removexattr, #endif .check_acl = ext4_check_acl, - .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/file_table.c b/fs/file_table.c index c3dee38..c3e89ad 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) struct files_struct *files = current->files; *fput_needed = 0; - if (likely((atomic_read(&files->count) == 1))) { + if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); } else { rcu_read_lock(); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 68ca487..78b519c 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -4,6 +4,19 @@ #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> +#include "internal.h" + +static inline void path_get_longterm(struct path *path) +{ + path_get(path); + mnt_make_longterm(path->mnt); +} + +static inline void path_put_longterm(struct path *path) +{ + mnt_make_shortterm(path->mnt); + path_put(path); +} /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. @@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) - path_put_long(&old_root); + path_put_longterm(&old_root); } /* @@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - path_get_long(path); + path_get_longterm(path); write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) - path_put_long(&old_pwd); + path_put_longterm(&old_pwd); } void chroot_fs_refs(struct path *old_root, struct path *new_root) @@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) write_seqcount_begin(&fs->seq); if (fs->root.dentry == old_root->dentry && fs->root.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->root = *new_root; count++; } if (fs->pwd.dentry == old_root->dentry && fs->pwd.mnt == old_root->mnt) { - path_get_long(new_root); + path_get_longterm(new_root); fs->pwd = *new_root; count++; } @@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) } while_each_thread(g, p); read_unlock(&tasklist_lock); while (count--) - path_put_long(old_root); + path_put_longterm(old_root); } void free_fs_struct(struct fs_struct *fs) { - path_put_long(&fs->root); - path_put_long(&fs->pwd); + path_put_longterm(&fs->root); + path_put_longterm(&fs->pwd); kmem_cache_free(fs_cachep, fs); } @@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) spin_lock(&old->lock); fs->root = old->root; - path_get_long(&fs->root); + path_get_longterm(&fs->root); fs->pwd = old->pwd; - path_get_long(&fs->pwd); + path_get_longterm(&fs->pwd); spin_unlock(&old->lock); } return fs; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fca6689..7cfdcb9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -19,6 +19,8 @@ #include <linux/fs.h> #include <linux/gfs2_ondisk.h> #include <linux/ext2_fs.h> +#include <linux/falloc.h> +#include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> #include <asm/uaccess.h> @@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return generic_file_aio_write(iocb, iov, nr_segs, pos); } +static void empty_write_end(struct page *page, unsigned from, + unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + + page_zero_new_buffers(page, from, to); + flush_dcache_page(page); + mark_page_accessed(page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + block_commit_write(page, from, to); +} + +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) +{ + unsigned start, end, next; + struct buffer_head *bh, *head; + int error; + + if (!page_has_buffers(page)) { + error = __block_write_begin(page, from, to - from, gfs2_block_map); + if (unlikely(error)) + return error; + + empty_write_end(page, from, to); + return 0; + } + + bh = head = page_buffers(page); + next = end = 0; + while (next < from) { + next += bh->b_size; + bh = bh->b_this_page; + } + start = next; + do { + next += bh->b_size; + if (buffer_mapped(bh)) { + if (end) { + error = __block_write_begin(page, start, end - start, + gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + end = 0; + } + start = next; + } + else + end = next; + bh = bh->b_this_page; + } while (next < to); + + if (end) { + error = __block_write_begin(page, start, end - start, gfs2_block_map); + if (unlikely(error)) + return error; + empty_write_end(page, start, end); + } + + return 0; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + u64 start = offset >> PAGE_CACHE_SHIFT; + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + pgoff_t curr; + struct page *page; + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; + unsigned int from, to; + + if (!end_offset) + end_offset = PAGE_CACHE_SIZE; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + goto out; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (unlikely(error)) + goto out; + } + + curr = start; + offset = start << PAGE_CACHE_SHIFT; + from = start_offset; + to = PAGE_CACHE_SIZE; + while (curr <= end) { + page = grab_cache_page_write_begin(inode->i_mapping, curr, + AOP_FLAG_NOFS); + if (unlikely(!page)) { + error = -ENOMEM; + goto out; + } + + if (curr == end) + to = end_offset; + error = write_empty_blocks(page, from, to); + if (!error && offset + to > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + i_size_write(inode, offset + to); + } + unlock_page(page); + page_cache_release(page); + if (error) + goto out; + curr++; + offset += PAGE_CACHE_SIZE; + from = 0; + } + + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + + brelse(dibh); + +out: + return error; +} + +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, + so it might end up with fewer data blocks */ + if (max_data <= *data_blocks) + return; + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes; + struct gfs2_alloc *al; + int error; + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << + sdp->sd_sb.sb_bsize_shift; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + if (!gfs2_write_alloc_required(ip, offset, len)) + goto out_unlock; + + while (len > 0) { + if (len < bytes) + bytes = len; + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + +retry: + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) { + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { + bytes >>= 1; + goto retry; + } + goto out_qunlock; + } + max_bytes = bytes; + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(al); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + goto out_unlock; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = gfs2_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops = { @@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = { .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .setlease = generic_setlease, + .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 040b5a2..d8b26ac 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -18,8 +18,6 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/fiemap.h> -#include <linux/swap.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -1257,261 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return ret; } -static void empty_write_end(struct page *page, unsigned from, - unsigned to) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - - page_zero_new_buffers(page, from, to); - flush_dcache_page(page); - mark_page_accessed(page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - block_commit_write(page, from, to); -} - - -static int write_empty_blocks(struct page *page, unsigned from, unsigned to) -{ - unsigned start, end, next; - struct buffer_head *bh, *head; - int error; - - if (!page_has_buffers(page)) { - error = __block_write_begin(page, from, to - from, gfs2_block_map); - if (unlikely(error)) - return error; - - empty_write_end(page, from, to); - return 0; - } - - bh = head = page_buffers(page); - next = end = 0; - while (next < from) { - next += bh->b_size; - bh = bh->b_this_page; - } - start = next; - do { - next += bh->b_size; - if (buffer_mapped(bh)) { - if (end) { - error = __block_write_begin(page, start, end - start, - gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - end = 0; - } - start = next; - } - else - end = next; - bh = bh->b_this_page; - } while (next < to); - - if (end) { - error = __block_write_begin(page, start, end - start, gfs2_block_map); - if (unlikely(error)) - return error; - empty_write_end(page, start, end); - } - - return 0; -} - -static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, - int mode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *dibh; - int error; - u64 start = offset >> PAGE_CACHE_SHIFT; - unsigned int start_offset = offset & ~PAGE_CACHE_MASK; - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; - pgoff_t curr; - struct page *page; - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; - unsigned int from, to; - - if (!end_offset) - end_offset = PAGE_CACHE_SIZE; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(error)) - goto out; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); - if (unlikely(error)) - goto out; - } - - curr = start; - offset = start << PAGE_CACHE_SHIFT; - from = start_offset; - to = PAGE_CACHE_SIZE; - while (curr <= end) { - page = grab_cache_page_write_begin(inode->i_mapping, curr, - AOP_FLAG_NOFS); - if (unlikely(!page)) { - error = -ENOMEM; - goto out; - } - - if (curr == end) - to = end_offset; - error = write_empty_blocks(page, from, to); - if (!error && offset + to > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - i_size_write(inode, offset + to); - } - unlock_page(page); - page_cache_release(page); - if (error) - goto out; - curr++; - offset += PAGE_CACHE_SIZE; - from = 0; - } - - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - - brelse(dibh); - -out: - return error; -} - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; - unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); - - for (tmp = max_data; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - max_data -= tmp; - } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; - *data_blocks = max_data; - *ind_blocks = max_blocks - max_data; - *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; - if (*len > max) { - *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); - } -} - -static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, - loff_t len) -{ - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; - struct gfs2_alloc *al; - int error; - loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; - next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode && (mode != FALLOC_FL_KEEP_SIZE)) - return -EOPNOTSUPP; - - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; - - len = next - offset; - bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; - if (!bytes) - bytes = UINT_MAX; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - - while (len > 0) { - if (len < bytes) - bytes = len; - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - goto retry; - } - goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - - rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + - RES_RG_HDR + gfs2_rg_blocks(al); - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = fallocate_chunk(inode, offset, max_bytes, mode); - gfs2_trans_end(sdp); - - if (error) - goto out_trans_fail; - - len -= max_bytes; - offset += max_bytes; - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - goto out_unlock; - -out_trans_fail: - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - - static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1562,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, - .fallocate = gfs2_fallocate, .fiemap = gfs2_fiemap, }; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 56f0da1..1ae35ba 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { error = vmtruncate(inode, attr->ia_size); if (error) - return error; + goto out_unlock; } setattr_copy(inode, attr); diff --git a/fs/internal.h b/fs/internal.h index 9687c2e..0663568 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, extern void release_mounts(struct list_head *); extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +extern int finish_automount(struct vfsmount *, struct path *); + +extern void mnt_make_longterm(struct vfsmount *); +extern void mnt_make_shortterm(struct vfsmount *); extern void __init mnt_init(void); @@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; - struct fiemap_extent *dest = fieinfo->fi_extents_start; + struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { @@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb, static int ioctl_fiemap(struct file *filp, unsigned long arg) { struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = filp->f_path.dentry->d_inode; struct super_block *sb = inode->i_sb; @@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) if (!inode->i_op->fiemap) return -EOPNOTSUPP; - if (copy_from_user(&fiemap, (struct fiemap __user *)arg, - sizeof(struct fiemap))) + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) @@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; - fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + fieinfo.fi_extents_start = ufiemap->fm_extents; if (fiemap.fm_extent_count != 0 && !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, @@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; - if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 85c6be2..3005ec4 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) size = sizeof(struct jffs2_eraseblock) * c->nr_blocks; #ifndef __ECOS if (jffs2_blocks_use_vmalloc(c)) - c->blocks = vmalloc(size); + c->blocks = vzalloc(size); else #endif - c->blocks = kmalloc(size, GFP_KERNEL); + c->blocks = kzalloc(size, GFP_KERNEL); if (!c->blocks) return -ENOMEM; - memset(c->blocks, 0, size); for (i=0; i<c->nr_blocks; i++) { INIT_LIST_HEAD(&c->blocks[i].list); c->blocks[i].offset = i * c->sector_size; diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index f864005..0bc6a6c 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -144,4 +144,4 @@ struct jffs2_sb_info { void *os_priv; }; -#endif /* _JFFS2_FB_SB */ +#endif /* _JFFS2_FS_SB */ diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9b572ca..4f9cc04 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rx.hdr_crc), crc); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK @@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat je32_to_cpu(rx.xid), xd->xid, je32_to_cpu(rx.version), xd->version); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->xprefix = rx.xprefix; xd->name_len = rx.name_len; @@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum ref_offset(xd->node), xd->data_crc, crc); kfree(data); xd->flags |= JFFS2_XFLAGS_INVALID; - return EIO; + return -EIO; } xd->flags |= JFFS2_XFLAGS_HOT; @@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x if (xd->xname) return 0; if (xd->flags & JFFS2_XFLAGS_INVALID) - return EIO; + return -EIO; if (unlikely(is_xattr_datum_unchecked(c, xd))) rc = do_verify_xattr_datum(c, xd); if (!rc) @@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref if (crc != je32_to_cpu(rr.node_crc)) { JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", offset, je32_to_cpu(rr.node_crc), crc); - return EIO; + return -EIO; } if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF @@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, je32_to_cpu(rr.totlen), PAD(sizeof(rr))); - return EIO; + return -EIO; } ref->ino = je32_to_cpu(rr.ino); ref->xid = je32_to_cpu(rr.xid); @@ -368,18 +368,6 @@ void path_get(struct path *path) EXPORT_SYMBOL(path_get); /** - * path_get_long - get a long reference to a path - * @path: path to get the reference to - * - * Given a path increment the reference count to the dentry and the vfsmount. - */ -void path_get_long(struct path *path) -{ - mntget_long(path->mnt); - dget(path->dentry); -} - -/** * path_put - put a reference to a path * @path: path to put the reference to * @@ -393,18 +381,6 @@ void path_put(struct path *path) EXPORT_SYMBOL(path_put); /** - * path_put_long - put a long reference to a path - * @path: path to put the reference to - * - * Given a path decrement the reference count to the dentry and the vfsmount. - */ -void path_put_long(struct path *path) -{ - dput(path->dentry); - mntput_long(path->mnt); -} - -/** * nameidata_drop_rcu - drop this nameidata out of rcu-walk * @nd: nameidata pathwalk data to drop * Returns: 0 on success, -ECHILD on failure @@ -800,12 +776,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); - if (link->mnt != nd->path.mnt) { - path_to_nameidata(link, nd); - nd->inode = nd->path.dentry->d_inode; - dget(dentry); - } - mntget(link->mnt); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); @@ -896,54 +868,148 @@ int follow_up(struct path *path) } /* - * serialization is taken care of in namespace.c + * Perform an automount + * - return -EISDIR to tell follow_managed() to stop and return the path we + * were called with. */ -static void __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static int follow_automount(struct path *path, unsigned flags, + bool *need_mntput) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); - if (!mounted) - return; - path->mnt = mounted; - path->dentry = mounted->mnt_root; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; + struct vfsmount *mnt; + int err; + + if (!path->dentry->d_op || !path->dentry->d_op->d_automount) + return -EREMOTE; + + /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT + * and this is the terminal part of the path. + */ + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + return -EISDIR; /* we actually want to stop here */ + + /* We want to mount if someone is trying to open/create a file of any + * type under the mountpoint, wants to traverse through the mountpoint + * or wants to open the mounted directory. + * + * We don't want to mount if someone's just doing a stat and they've + * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and + * appended a '/' to the name. + */ + if (!(flags & LOOKUP_FOLLOW) && + !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE))) + return -EISDIR; + + current->total_link_count++; + if (current->total_link_count >= 40) + return -ELOOP; + + mnt = path->dentry->d_op->d_automount(path); + if (IS_ERR(mnt)) { + /* + * The filesystem is allowed to return -EISDIR here to indicate + * it doesn't want to automount. For instance, autofs would do + * this so that its userspace daemon can mount on this dentry. + * + * However, we can only permit this if it's a terminal point in + * the path being looked up; if it wasn't then the remainder of + * the path is inaccessible and we should say so. + */ + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + return -EREMOTE; + return PTR_ERR(mnt); } -} -static int __follow_mount(struct path *path) -{ - int res = 0; - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; + if (!mnt) /* mount collision */ + return 0; + + err = finish_automount(mnt, path); + + switch (err) { + case -EBUSY: + /* Someone else made a mount here whilst we were busy */ + return 0; + case 0: dput(path->dentry); - if (res) + if (*need_mntput) mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - res = 1; + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + *need_mntput = true; + return 0; + default: + return err; } - return res; + } -static void follow_mount(struct path *path) +/* + * Handle a dentry that is managed in some way. + * - Flagged for transit management (autofs) + * - Flagged as mountpoint + * - Flagged as automount point + * + * This may only be called in refwalk mode. + * + * Serialization is taken care of in namespace.c + */ +static int follow_managed(struct path *path, unsigned flags) { - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + unsigned managed; + bool need_mntput = false; + int ret; + + /* Given that we're not holding a lock here, we retain the value in a + * local variable for each dentry as we look at it so that we don't see + * the components of that value change under us */ + while (managed = ACCESS_ONCE(path->dentry->d_flags), + managed &= DCACHE_MANAGED_DENTRY, + unlikely(managed != 0)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage(path->dentry, + false, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (mounted) { + dput(path->dentry); + if (need_mntput) + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + need_mntput = true; + continue; + } + + /* Something is mounted on this dentry in another + * namespace and/or whatever was mounted there in this + * namespace got unmounted before we managed to get the + * vfsmount_lock */ + } + + /* Handle an automount point */ + if (managed & DCACHE_NEED_AUTOMOUNT) { + ret = follow_automount(path, flags, &need_mntput); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + continue; + } + + /* We didn't change the current path point */ + break; } + return 0; } -int follow_down(struct path *path) +int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -958,13 +1024,41 @@ int follow_down(struct path *path) return 0; } +/* + * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we + * meet a managed dentry and we're not walking to "..". True is returned to + * continue, false to abort. + */ +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + struct inode **inode, bool reverse_transit) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted; + if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && + !reverse_transit && + path->dentry->d_op->d_manage(path->dentry, false, true) < 0) + return false; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); + if (!mounted) + break; + path->mnt = mounted; + path->dentry = mounted->mnt_root; + nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *inode = path->dentry->d_inode; + } + + if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) + return reverse_transit; + return true; +} + static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; set_root_rcu(nd); - while(1) { + while (1) { if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; @@ -987,12 +1081,80 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); inode = nd->path.dentry->d_inode; } - __follow_mount_rcu(nd, &nd->path, &inode); + __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; return 0; } +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + * + * Care must be taken as namespace_sem may be held (indicated by mounting_here + * being true). + */ +int follow_down(struct path *path, bool mounting_here) +{ + unsigned managed; + int ret; + + while (managed = ACCESS_ONCE(path->dentry->d_flags), + unlikely(managed & DCACHE_MANAGED_DENTRY)) { + /* Allow the filesystem to manage the transit without i_mutex + * being held. + * + * We indicate to the filesystem if someone is trying to mount + * something here. This gives autofs the chance to deny anyone + * other than its daemon the right to mount on its + * superstructure. + * + * The filesystem may sleep at this point. + */ + if (managed & DCACHE_MANAGE_TRANSIT) { + BUG_ON(!path->dentry->d_op); + BUG_ON(!path->dentry->d_op->d_manage); + ret = path->dentry->d_op->d_manage( + path->dentry, mounting_here, false); + if (ret < 0) + return ret == -EISDIR ? 0 : ret; + } + + /* Transit to a mounted filesystem. */ + if (managed & DCACHE_MOUNTED) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + continue; + } + + /* Don't handle automount points here */ + break; + } + return 0; +} + +/* + * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() + */ +static void follow_mount(struct path *path) +{ + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path); + if (!mounted) + break; + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); + } +} + static void follow_dotdot(struct nameidata *nd) { set_root(nd); @@ -1057,12 +1219,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; struct inode *dir; + int err; + /* * See if the low-level filesystem might want * to use its own hash.. */ if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - int err = parent->d_op->d_hash(parent, nd->inode, name); + err = parent->d_op->d_hash(parent, nd->inode, name); if (err < 0) return err; } @@ -1089,22 +1253,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, nd->seq = seq; if (dentry->d_flags & DCACHE_OP_REVALIDATE) goto need_revalidate; +done2: path->mnt = mnt; path->dentry = dentry; - __follow_mount_rcu(nd, path, inode); - } else { - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; + if (likely(__follow_mount_rcu(nd, path, inode, false))) + return 0; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + /* fallthru */ + } + dentry = __d_lookup(parent, name); + if (!dentry) + goto need_lookup; found: - if (dentry->d_flags & DCACHE_OP_REVALIDATE) - goto need_revalidate; + if (dentry->d_flags & DCACHE_OP_REVALIDATE) + goto need_revalidate; done: - path->mnt = mnt; - path->dentry = dentry; - __follow_mount(path); - *inode = path->dentry->d_inode; - } + path->mnt = mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + *inode = path->dentry->d_inode; return 0; need_lookup: @@ -1143,6 +1315,8 @@ need_revalidate: goto need_lookup; if (IS_ERR(dentry)) goto fail; + if (nd->flags & LOOKUP_RCU) + goto done2; goto done; fail: @@ -1150,17 +1324,6 @@ fail: } /* - * This is a temporary kludge to deal with "automount" symlinks; proper - * solution is to trigger them on follow_mount(), so that do_lookup() - * would DTRT. To be killed before 2.6.34-final. - */ -static inline int follow_on_final(struct inode *inode, unsigned lookup_flags) -{ - return inode && unlikely(inode->i_op->follow_link) && - ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode)); -} - -/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -1298,7 +1461,8 @@ last_component: err = do_lookup(nd, &this, &next, &inode); if (err) break; - if (follow_on_final(inode, lookup_flags)) { + if (inode && unlikely(inode->i_op->follow_link) && + (lookup_flags & LOOKUP_FOLLOW)) { if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry)) return -ECHILD; BUG_ON(inode != next.dentry->d_inode); @@ -2200,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(path)) { - error = -ELOOP; - if (open_flag & O_NOFOLLOW) - goto exit_dput; - } + error = follow_managed(path, nd->flags); + if (error < 0) + goto exit_dput; error = -ENOENT; if (!path->dentry->d_inode) @@ -2353,8 +2515,7 @@ reval: struct inode *linki = link.dentry->d_inode; void *cookie; error = -ELOOP; - /* S_ISDIR part is a temporary automount kludge */ - if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(linki->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -3413,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = { }; EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ diff --git a/fs/namespace.c b/fs/namespace.c index 3ddfd90..7b0b953 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt) unsigned int mnt_get_count(struct vfsmount *mnt) { #ifdef CONFIG_SMP - unsigned int count = atomic_read(&mnt->mnt_longrefs); + unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) if (!mnt->mnt_pcp) goto out_free_devname; - atomic_set(&mnt->mnt_longrefs, 1); + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; @@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path) list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); } +static inline void __mnt_make_longterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_inc(&mnt->mnt_longterm); +#endif +} + +/* needs vfsmount lock for write */ +static inline void __mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + atomic_dec(&mnt->mnt_longterm); +#endif +} + /* * vfsmount lock must be held for write */ @@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) + list_for_each_entry(m, &head, mnt_list) { m->mnt_ns = n; + __mnt_make_longterm(m); + } + list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + @@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt) deactivate_super(sb); } -#ifdef CONFIG_SMP -static inline void __mntput(struct vfsmount *mnt, int longrefs) +static void mntput_no_expire(struct vfsmount *mnt) { - if (!longrefs) { put_again: - br_read_lock(vfsmount_lock); - if (likely(atomic_read(&mnt->mnt_longrefs))) { - mnt_dec_count(mnt); - br_read_unlock(vfsmount_lock); - return; - } +#ifdef CONFIG_SMP + br_read_lock(vfsmount_lock); + if (likely(atomic_read(&mnt->mnt_longterm))) { + mnt_dec_count(mnt); br_read_unlock(vfsmount_lock); - } else { - BUG_ON(!atomic_read(&mnt->mnt_longrefs)); - if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1)) - return; + return; } + br_read_unlock(vfsmount_lock); br_write_lock(vfsmount_lock); - if (!longrefs) - mnt_dec_count(mnt); - else - atomic_dec(&mnt->mnt_longrefs); + mnt_dec_count(mnt); if (mnt_get_count(mnt)) { br_write_unlock(vfsmount_lock); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - br_write_unlock(vfsmount_lock); - acct_auto_close_mnt(mnt); - goto put_again; - } - br_write_unlock(vfsmount_lock); - mntfree(mnt); -} #else -static inline void __mntput(struct vfsmount *mnt, int longrefs) -{ -put_again: mnt_dec_count(mnt); if (likely(mnt_get_count(mnt))) return; br_write_lock(vfsmount_lock); +#endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; @@ -789,12 +786,6 @@ put_again: br_write_unlock(vfsmount_lock); mntfree(mnt); } -#endif - -static void mntput_no_expire(struct vfsmount *mnt) -{ - __mntput(mnt, 0); -} void mntput(struct vfsmount *mnt) { @@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt) /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ if (unlikely(mnt->mnt_expiry_mark)) mnt->mnt_expiry_mark = 0; - __mntput(mnt, 0); + mntput_no_expire(mnt); } } EXPORT_SYMBOL(mntput); @@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mntput_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) { - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ - if (unlikely(mnt->mnt_expiry_mark)) - mnt->mnt_expiry_mark = 0; - __mntput(mnt, 1); - } -#else - mntput(mnt); -#endif -} -EXPORT_SYMBOL(mntput_long); - -struct vfsmount *mntget_long(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - if (mnt) - atomic_inc(&mnt->mnt_longrefs); - return mnt; -#else - return mntget(mnt); -#endif -} -EXPORT_SYMBOL(mntget_long); - void mnt_pin(struct vfsmount *mnt) { br_write_lock(vfsmount_lock); @@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head) dput(dentry); mntput(m); } - mntput_long(mnt); + mntput(mnt); } } @@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head) */ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { + LIST_HEAD(tmp_list); struct vfsmount *p; for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, kill); + list_move(&p->mnt_hash, &tmp_list); if (propagate) - propagate_umount(kill); + propagate_umount(&tmp_list); - list_for_each_entry(p, kill, mnt_hash) { + list_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + __mnt_make_shortterm(p); list_del_init(&p->mnt_child); if (p->mnt_parent != p) { p->mnt_parent->mnt_ghosts++; @@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) } change_mnt_propagation(p, MS_PRIVATE); } + list_splice(&tmp_list, kill); } static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); @@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name) return err; down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto out; + err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) goto out; @@ -1904,6 +1872,8 @@ out: return err; } +static int do_add_mount(struct vfsmount *, struct path *, int); + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags, int mnt_flags, char *name, void *data) { struct vfsmount *mnt; + int err; if (!type) return -EINVAL; @@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); - return do_add_mount(mnt, path, mnt_flags, NULL); + err = do_add_mount(mnt, path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(m) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&m->mnt_expire)) { + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + list_del_init(&m->mnt_expire); + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); + } + mntput(m); + mntput(m); + return err; } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags) { int err; @@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + err = follow_down(path, true); + if (err < 0) + goto unlock; + err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; @@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, goto unlock; newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, path))) - goto unlock; - - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); - - up_write(&namespace_sem); - return 0; + err = graft_tree(newmnt, path); unlock: up_write(&namespace_sem); - mntput_long(newmnt); return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + down_write(&namespace_sem); + br_write_lock(vfsmount_lock); + + list_add_tail(&mnt->mnt_expire, expiry_list); + + br_write_unlock(vfsmount_lock); + up_write(&namespace_sem); +} +EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any @@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void) return new_ns; } +void mnt_make_longterm(struct vfsmount *mnt) +{ + __mnt_make_longterm(mnt); +} + +void mnt_make_shortterm(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) + return; + br_write_lock(vfsmount_lock); + atomic_dec(&mnt->mnt_longterm); + br_write_unlock(vfsmount_lock); +#endif +} + /* * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. @@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, q = new_ns->root; while (p) { q->mnt_ns = new_ns; + __mnt_make_longterm(q); if (fs) { if (p == fs->root.mnt) { + fs->root.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); rootmnt = p; - fs->root.mnt = mntget_long(q); } if (p == fs->pwd.mnt) { + fs->pwd.mnt = mntget(q); + __mnt_make_longterm(q); + mnt_make_shortterm(p); pwdmnt = p; - fs->pwd.mnt = mntget_long(q); } } p = next_mnt(p, mnt_ns->root); @@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, up_write(&namespace_sem); if (rootmnt) - mntput_long(rootmnt); + mntput(rootmnt); if (pwdmnt) - mntput_long(pwdmnt); + mntput(pwdmnt); return new_ns; } @@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) new_ns = alloc_mnt_ns(); if (!IS_ERR(new_ns)) { mnt->mnt_ns = new_ns; + __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index df8c03a..2c3eb33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -970,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) { struct nfs_server *server = NFS_SERVER(inode); - if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) + if (IS_AUTOMOUNT(inode)) return 0; if (nd != NULL) { /* VFS wants an on-the-wire revalidation */ @@ -1173,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) @@ -1246,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs_open_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ce00b70..d851242 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; - set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); + inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; @@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && - !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) + !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bfa3a34..4644f04 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -252,6 +252,7 @@ extern char *nfs_path(const char *base, const struct dentry *droot, const struct dentry *dentry, char *buffer, ssize_t buflen); +extern struct vfsmount *nfs_d_automount(struct path *path); /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 74aaf39..f32b860 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -97,9 +97,8 @@ Elong: } /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint * * When we encounter a mountpoint on the server, we want to set up * a mountpoint on the client too, to prevent inode numbers from @@ -109,87 +108,65 @@ Elong: * situation, and that different filesystems may want to use * different security flavours. */ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); struct dentry *parent; struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; - dprintk("--> nfs_follow_mountpoint()\n"); + dprintk("--> nfs_d_automount()\n"); - err = -ESTALE; - if (IS_ROOT(dentry)) - goto out_err; + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; - err = -ENOMEM; + mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr(); if (fh == NULL || fattr == NULL) - goto out_err; + goto out; dprintk("%s: enter\n", __func__); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); - /* Look it up again */ - parent = dget_parent(nd->path.dentry); + /* Look it up again to get its attributes */ + parent = dget_parent(path->dentry); err = server->nfs_client->rpc_ops->lookup(parent->d_inode, - &nd->path.dentry->d_name, + &path->dentry->d_name, fh, fattr); dput(parent); - if (err != 0) - goto out_err; + if (err != 0) { + mnt = ERR_PTR(err); + goto out; + } if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); + mnt = nfs_do_refmount(path->mnt, path->dentry); else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, - fattr); - err = PTR_ERR(mnt); + mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); if (IS_ERR(mnt)) - goto out_err; + goto out; - mntget(mnt); - err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, - &nfs_automount_list); - if (err < 0) { - mntput(mnt); - if (err == -EBUSY) - goto out_follow; - goto out_err; - } - path_put(&nd->path); - nd->path.mnt = mnt; - nd->path.dentry = dget(mnt->mnt_root); + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); - dprintk("%s: done, returned %d\n", __func__, err); - - dprintk("<-- nfs_follow_mountpoint() = %d\n", err); - return ERR_PTR(err); -out_err: - path_put(&nd->path); - goto out; -out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path)) - ; - err = 0; - goto out; +out_nofree: + dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); + return mnt; } const struct inode_operations nfs_mountpoint_inode_operations = { - .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, }; const struct inode_operations nfs_referral_inode_operations = { - .follow_link = nfs_follow_mountpoint, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3c7f70..641117f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -87,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (d_mountpoint(path.dentry) && follow_down(&path)) - ; + err = follow_down(&path, false); + if (err < 0) + goto out; exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index ab152c0..77a8de5 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -1,7 +1,6 @@ config OCFS2_FS tristate "OCFS2 file system support" - depends on NET && SYSFS - select CONFIGFS_FS + depends on NET && SYSFS && CONFIGFS_FS select JBD2 select CRC32 select QUOTA diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 63e3fca..a665195 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1989,20 +1989,20 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); } -static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, +static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_space_resv sr; int change_size = 1; int cmd = OCFS2_IOC_RESVSP64; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (S_ISDIR(inode->i_mode)) - return -ENODEV; - if (mode & FALLOC_FL_KEEP_SIZE) change_size = 0; @@ -2610,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, - .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2642,6 +2641,7 @@ const struct file_operations ocfs2_fops = { .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, + .fallocate = ocfs2_fallocate, }; const struct file_operations ocfs2_dops = { @@ -255,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + return file->f_op->fallocate(file, mode, offset, len); } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) @@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void) static void __exit exit_pipe_fs(void) { unregister_filesystem(&pipe_fs_type); - mntput_long(pipe_mnt); + mntput(pipe_mnt); } fs_initcall(init_pipe_fs); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index e5f63da..aa68a8a 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -29,7 +29,6 @@ config SQUASHFS config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS - default n help Saying Y here includes support for extended attributes (xattrs). Xattrs are name:value pairs associated with inodes by @@ -40,7 +39,6 @@ config SQUASHFS_XATTR config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS - default n select LZO_DECOMPRESS help Saying Y here includes support for reading Squashfs file systems @@ -53,10 +51,24 @@ config SQUASHFS_LZO If unsure, say N. +config SQUASHFS_XZ + bool "Include support for XZ compressed file systems" + depends on SQUASHFS + select XZ_DEC + help + Saying Y here includes support for reading Squashfs file systems + compressed with XZ compresssion. XZ gives better compression than + the default zlib compression, at the expense of greater CPU and + memory overhead. + + XZ is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS - default n help Saying Y here allows you to specify cache size. diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7672bac..cecf2be 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o +squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 653c030..2fb2882 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -34,7 +34,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 57314be..26b15ae 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -55,7 +55,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 24af9ce..a5940e5 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -27,7 +27,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "decompressor.h" #include "squashfs.h" @@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { }; #ifndef CONFIG_SQUASHFS_LZO -static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { +static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif +#ifndef CONFIG_SQUASHFS_XZ +static const struct squashfs_decompressor squashfs_xz_comp_ops = { + NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, - &squashfs_lzma_unsupported_comp_ops, -#ifdef CONFIG_SQUASHFS_LZO &squashfs_lzo_comp_ops, -#else - &squashfs_lzo_unsupported_comp_ops, -#endif + &squashfs_xz_comp_ops, + &squashfs_lzma_unsupported_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 7425f80..3b305a7 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk, return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, length, srclength, pages); } + +#ifdef CONFIG_SQUASHFS_XZ +extern const struct squashfs_decompressor squashfs_xz_comp_ops; +#endif + +#ifdef CONFIG_SQUASHFS_LZO +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; +#endif + #endif diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index 7c90bbd..7eef571 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -39,7 +39,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index b7f64bc..d8f3245 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -37,7 +37,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" /* diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 5d87789..7da759e 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -29,7 +29,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 5d45569..ba729d8 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -27,11 +27,6 @@ #define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) -static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) -{ - return list_entry(inode, struct squashfs_inode_info, vfs_inode); -} - /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, int, int); @@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; - -/* lzo_wrapper.c */ -extern const struct squashfs_decompressor squashfs_lzo_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index c5137fc..39533fe 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -238,6 +238,7 @@ struct meta_index { #define ZLIB_COMPRESSION 1 #define LZMA_COMPRESSION 2 #define LZO_COMPRESSION 3 +#define XZ_COMPRESSION 4 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index d3e3a37..359baef 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -45,4 +45,10 @@ struct squashfs_inode_info { }; struct inode vfs_inode; }; + + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} #endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index d33be5d..05385db 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -32,7 +32,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c new file mode 100644 index 0000000..856756c --- /dev/null +++ b/fs/squashfs/xz_wrapper.c @@ -0,0 +1,153 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xz_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/xz.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_xz { + struct xz_dec *state; + struct xz_buf buf; +}; + +static void *squashfs_xz_init(struct squashfs_sb_info *msblk) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + + stream->state = xz_dec_init(XZ_PREALLOC, block_size); + if (stream->state == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate xz workspace\n"); + kfree(stream); + return NULL; +} + + +static void squashfs_xz_free(void *strm) +{ + struct squashfs_xz *stream = strm; + + if (stream) { + xz_dec_end(stream->state); + kfree(stream); + } +} + + +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + enum xz_ret xz_err; + int avail, total = 0, k = 0, page = 0; + struct squashfs_xz *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + xz_dec_reset(stream->state); + stream->buf.in_pos = 0; + stream->buf.in_size = 0; + stream->buf.out_pos = 0; + stream->buf.out_size = PAGE_CACHE_SIZE; + stream->buf.out = buffer[page++]; + + do { + if (stream->buf.in_pos == stream->buf.in_size && k < b) { + avail = min(length, msblk->devblksize - offset); + length -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + stream->buf.in = bh[k]->b_data + offset; + stream->buf.in_size = avail; + stream->buf.in_pos = 0; + offset = 0; + } + + if (stream->buf.out_pos == stream->buf.out_size + && page < pages) { + stream->buf.out = buffer[page++]; + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } + + xz_err = xz_dec_run(stream->state, &stream->buf); + + if (stream->buf.in_pos == stream->buf.in_size && k < b) + put_bh(bh[k++]); + } while (xz_err == XZ_OK); + + if (xz_err != XZ_STREAM_END) { + ERROR("xz_dec_run error, data probably corrupt\n"); + goto release_mutex; + } + + if (k < b) { + ERROR("xz_uncompress error, input remaining\n"); + goto release_mutex; + } + + total += stream->buf.out_pos; + mutex_unlock(&msblk->read_data_mutex); + return total; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_xz_comp_ops = { + .init = squashfs_xz_init, + .free = squashfs_xz_free, + .decompress = squashfs_xz_uncompress, + .id = XZ_COMPRESSION, + .name = "xz", + .supported = 1 +}; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 7a60387..818a5e0 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -29,7 +29,6 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" -#include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" @@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, struct buffer_head **bh, int b, int offset, int length, int srclength, int pages) { - int zlib_err = 0, zlib_init = 0; - int avail, bytes, k = 0, page = 0; + int zlib_err, zlib_init = 0; + int k = 0, page = 0; z_stream *stream = msblk->stream; mutex_lock(&msblk->read_data_mutex); @@ -75,11 +74,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, stream->avail_out = 0; stream->avail_in = 0; - bytes = length; do { if (stream->avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; + int avail = min(length, msblk->devblksize - offset); + length -= avail; wait_on_buffer(bh[k]); if (!buffer_uptodate(bh[k])) goto release_mutex; @@ -128,6 +126,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } + if (k < b) { + ERROR("zlib_uncompress error, data remaining\n"); + goto release_mutex; + } + length = stream->total_out; mutex_unlock(&msblk->read_data_mutex); return length; @@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flag & AT_NO_AUTOMOUNT) + lookup_flags |= LOOKUP_NO_AUTOMOUNT; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -1141,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return mnt; err: - mntput_long(mnt); + mntput(mnt); return ERR_PTR(err); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ef51eb43e..a55c1b4 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_trace.h" #include <linux/dcache.h> +#include <linux/falloc.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -882,6 +883,60 @@ out_unlock: return ret; } +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + long error; + loff_t new_size = 0; + xfs_flock64_t bf; + xfs_inode_t *ip = XFS_I(inode); + int cmd = XFS_IOC_RESVSP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + bf.l_whence = 0; + bf.l_start = offset; + bf.l_len = len; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (mode & FALLOC_FL_PUNCH_HOLE) + cmd = XFS_IOC_UNRESVSP; + + /* check the new inode size is valid before allocating */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + + STATIC int xfs_file_open( struct inode *inode, @@ -1000,6 +1055,7 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index da54403..bd57278 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -46,7 +46,6 @@ #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/falloc.h> #include <linux/fiemap.h> #include <linux/slab.h> @@ -505,61 +504,6 @@ xfs_vn_setattr( return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } -STATIC long -xfs_vn_fallocate( - struct inode *inode, - int mode, - loff_t offset, - loff_t len) -{ - long error; - loff_t new_size = 0; - xfs_flock64_t bf; - xfs_inode_t *ip = XFS_I(inode); - int cmd = XFS_IOC_RESVSP; - - /* preallocation on directories not yet supported */ - error = -ENODEV; - if (S_ISDIR(inode->i_mode)) - goto out_error; - - bf.l_whence = 0; - bf.l_start = offset; - bf.l_len = len; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (mode & FALLOC_FL_PUNCH_HOLE) - cmd = XFS_IOC_UNRESVSP; - - /* check the new inode size is valid before allocating */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); - if (error) - goto out_unlock; - - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); - } - -out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); -out_error: - return error; -} - #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) /* @@ -653,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = { .getxattr = generic_getxattr, .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, - .fallocate = xfs_vn_fallocate, .fiemap = xfs_vn_fiemap, }; diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index e6cf955..0df8889 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -75,11 +75,11 @@ xfs_cmn_err( { struct va_format vaf; va_list args; - int panic = 0; + int do_panic = 0; if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { printk(KERN_ALERT "XFS: Transforming an alert into a BUG."); - panic = 1; + do_panic = 1; } va_start(args, fmt); @@ -89,7 +89,7 @@ xfs_cmn_err( printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf); va_end(args); - BUG_ON(panic); + BUG_ON(do_panic); } void |