aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/user.c88
-rw-r--r--fs/fcntl.c66
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c17
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/incore.h11
-rw-r--r--fs/gfs2/inode.c101
-rw-r--r--fs/gfs2/inode.h5
-rw-r--r--fs/gfs2/log.c158
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c5
-rw-r--r--fs/gfs2/ops_fstype.c19
-rw-r--r--fs/gfs2/quota.c102
-rw-r--r--fs/gfs2/rgrp.c68
-rw-r--r--fs/gfs2/super.c11
-rw-r--r--fs/gfs2/sys.c6
-rw-r--r--fs/gfs2/trans.c18
-rw-r--r--fs/inode.c2
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jffs2/background.c3
-rw-r--r--fs/jffs2/erase.c12
-rw-r--r--fs/jffs2/fs.c10
-rw-r--r--fs/jffs2/gc.c17
-rw-r--r--fs/jffs2/nodelist.h10
-rw-r--r--fs/jffs2/nodemgmt.c28
-rw-r--r--fs/jffs2/os-linux.h3
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/libfs.c35
-rw-r--r--fs/logfs/dir.c2
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/logfs/logfs_abi.h10
-rw-r--r--fs/namespace.c13
-rw-r--r--fs/nfs/client.c55
-rw-r--r--fs/nfs/delegation.c2
-rw-r--r--fs/nfs/dir.c143
-rw-r--r--fs/nfs/file.c15
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/nfs/getroot.c191
-rw-r--r--fs/nfs/inode.c58
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/iostat.h6
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs3acl.c23
-rw-r--r--fs/nfs/nfs3proc.c128
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4namespace.c12
-rw-r--r--fs/nfs/nfs4proc.c172
-rw-r--r--fs/nfs/nfs4state.c36
-rw-r--r--fs/nfs/nfs4xdr.c22
-rw-r--r--fs/nfs/nfsroot.c14
-rw-r--r--fs/nfs/pagelist.c14
-rw-r--r--fs/nfs/proc.c144
-rw-r--r--fs/nfs/read.c4
-rw-r--r--fs/nfs/super.c147
-rw-r--r--fs/nfs/unlink.c4
-rw-r--r--fs/nfsd/export.c44
-rw-r--r--fs/nfsd/nfs4callback.c140
-rw-r--r--fs/nfsd/nfs4proc.c50
-rw-r--r--fs/nfsd/nfs4state.c376
-rw-r--r--fs/nfsd/nfs4xdr.c27
-rw-r--r--fs/nfsd/nfsctl.c64
-rw-r--r--fs/nfsd/nfsd.h6
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/state.h47
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nfsd/vfs.h1
-rw-r--r--fs/nfsd/xdr4.h11
-rw-r--r--fs/nilfs2/alloc.c154
-rw-r--r--fs/nilfs2/alloc.h7
-rw-r--r--fs/nilfs2/btree.c91
-rw-r--r--fs/nilfs2/btree.h23
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/recovery.c2
-rw-r--r--fs/nilfs2/segbuf.c70
-rw-r--r--fs/nilfs2/segbuf.h10
-rw-r--r--fs/nilfs2/segment.c157
-rw-r--r--fs/nilfs2/segment.h6
-rw-r--r--fs/nilfs2/super.c218
-rw-r--r--fs/nilfs2/the_nilfs.c10
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c908
-rw-r--r--fs/ocfs2/alloc.h12
-rw-r--r--fs/ocfs2/aops.c3
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/tcp.c3
-rw-r--r--fs/ocfs2/dir.c75
-rw-r--r--fs/ocfs2/dlm/dlmast.c8
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c28
-rw-r--r--fs/ocfs2/dlm/dlmlock.c6
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c27
-rw-r--r--fs/ocfs2/dlm/dlmthread.c16
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c3
-rw-r--r--fs/ocfs2/file.c215
-rw-r--r--fs/ocfs2/inode.c45
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/journal.c26
-rw-r--r--fs/ocfs2/journal.h15
-rw-r--r--fs/ocfs2/localalloc.c275
-rw-r--r--fs/ocfs2/localalloc.h3
-rw-r--r--fs/ocfs2/mmap.c48
-rw-r--r--fs/ocfs2/namei.c91
-rw-r--r--fs/ocfs2/ocfs2.h22
-rw-r--r--fs/ocfs2/ocfs2_fs.h144
-rw-r--r--fs/ocfs2/quota_global.c4
-rw-r--r--fs/ocfs2/quota_local.c50
-rw-r--r--fs/ocfs2/refcounttree.c74
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/reservations.c847
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c19
-rw-r--r--fs/ocfs2/suballoc.c688
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/ocfs2/super.c88
-rw-r--r--fs/ocfs2/super.h7
-rw-r--r--fs/ocfs2/xattr.c103
-rw-r--r--fs/omfs/inode.c1
-rw-r--r--fs/proc/base.c10
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/sysfs/bin.c26
-rw-r--r--fs/sysfs/dir.c114
-rw-r--r--fs/sysfs/file.c17
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysfs/inode.c6
-rw-r--r--fs/sysfs/mount.c95
-rw-r--r--fs/sysfs/symlink.c36
-rw-r--r--fs/sysfs/sysfs.h34
-rw-r--r--fs/timerfd.c25
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c231
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c91
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h150
-rw-r--r--fs/xfs/quota/xfs_dquot.c193
-rw-r--r--fs/xfs/quota/xfs_dquot.h35
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c30
-rw-r--r--fs/xfs/quota/xfs_qm.c609
-rw-r--r--fs/xfs/quota/xfs_qm.h23
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c152
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h102
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c29
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_buf_item.c55
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_error.c30
-rw-r--r--fs/xfs/xfs_error.h9
-rw-r--r--fs/xfs/xfs_extfree_item.c18
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode_item.c21
-rw-r--r--fs/xfs/xfs_iomap.c123
-rw-r--r--fs/xfs/xfs_iomap.h47
-rw-r--r--fs/xfs/xfs_log.c702
-rw-r--r--fs/xfs/xfs_log.h13
-rw-r--r--fs/xfs/xfs_log_priv.h12
-rw-r--r--fs/xfs/xfs_log_recover.c311
-rw-r--r--fs/xfs/xfs_mount.c7
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_trans.c760
-rw-r--r--fs/xfs/xfs_trans.h14
-rw-r--r--fs/xfs/xfs_trans_buf.c187
184 files changed, 6992 insertions, 5111 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 17903b4..031dbe3 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
if (lkb->lkb_rqmode < mode)
break;
- if (!lkb)
- list_add_tail(new, head);
- else
- __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+ __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
}
/* add/remove lkb to rsb's grant/convert/wait queue */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 8b6e73c..b627285 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
if (!ast_type) {
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+ lkb->lkb_ast_first = type;
wake_up_interruptible(&proc->wait);
}
if (type == AST_COMP && (ast_type & AST_COMP))
@@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
if (eol) {
- lkb->lkb_ast_type &= ~AST_BAST;
lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
}
@@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
}
static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
- int bmode, char __user *buf, size_t count)
+ int mode, char __user *buf, size_t count)
{
#ifdef CONFIG_COMPAT
struct dlm_lock_result32 result32;
@@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
if (type == AST_BAST) {
result.user_astaddr = ua->bastaddr;
result.user_astparam = ua->bastparam;
- result.bast_mode = bmode;
+ result.bast_mode = mode;
} else {
result.user_astaddr = ua->castaddr;
result.user_astparam = ua->castparam;
@@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_user_proc *proc = file->private_data;
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
- int error, type=0, bmode=0, removed = 0;
+ int error = 0, removed;
+ int ret_type, ret_mode;
+ int bastmode, castmode, do_bast, do_cast;
if (count == sizeof(struct dlm_device_version)) {
error = copy_version_to_user(buf, count);
@@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
#endif
return -EINVAL;
+ try_another:
+
/* do we really need this? can a read happen after a close? */
if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
return -EINVAL;
@@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
- if (lkb->lkb_ast_type & AST_COMP) {
- lkb->lkb_ast_type &= ~AST_COMP;
- type = AST_COMP;
- } else if (lkb->lkb_ast_type & AST_BAST) {
- lkb->lkb_ast_type &= ~AST_BAST;
- type = AST_BAST;
- bmode = lkb->lkb_bastmode;
+ removed = 0;
+ ret_type = 0;
+ ret_mode = 0;
+ do_bast = lkb->lkb_ast_type & AST_BAST;
+ do_cast = lkb->lkb_ast_type & AST_COMP;
+ bastmode = lkb->lkb_bastmode;
+ castmode = lkb->lkb_castmode;
+
+ /* when both are queued figure out which to do first and
+ switch first so the other goes in the next read */
+
+ if (do_cast && do_bast) {
+ if (lkb->lkb_ast_first == AST_COMP) {
+ ret_type = AST_COMP;
+ ret_mode = castmode;
+ lkb->lkb_ast_type &= ~AST_COMP;
+ lkb->lkb_ast_first = AST_BAST;
+ } else {
+ ret_type = AST_BAST;
+ ret_mode = bastmode;
+ lkb->lkb_ast_type &= ~AST_BAST;
+ lkb->lkb_ast_first = AST_COMP;
+ }
+ } else {
+ ret_type = lkb->lkb_ast_first;
+ ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+ lkb->lkb_ast_type &= ~ret_type;
+ lkb->lkb_ast_first = 0;
+ }
+
+ /* if we're doing a bast but the bast is unnecessary, then
+ switch to do nothing or do a cast if that was needed next */
+
+ if ((ret_type == AST_BAST) &&
+ dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+ ret_type = 0;
+ ret_mode = 0;
+
+ if (do_cast) {
+ ret_type = AST_COMP;
+ ret_mode = castmode;
+ lkb->lkb_ast_type &= ~AST_COMP;
+ lkb->lkb_ast_first = 0;
+ }
+ }
+
+ if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+ log_print("device_read %x ast_first %x ast_type %x",
+ lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
}
if (!lkb->lkb_ast_type) {
@@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
}
spin_unlock(&proc->asts_spin);
- error = copy_result_to_user(lkb->lkb_ua,
- test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- type, bmode, buf, count);
+ if (ret_type) {
+ error = copy_result_to_user(lkb->lkb_ua,
+ test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+ ret_type, ret_mode, buf, count);
+
+ if (ret_type == AST_COMP)
+ lkb->lkb_castmode_done = castmode;
+ if (ret_type == AST_BAST)
+ lkb->lkb_bastmode_done = bastmode;
+ }
/* removes reference for the proc->asts lists added by
dlm_user_add_ast() and may result in the lkb being freed */
+
if (removed)
dlm_put_lkb(lkb);
+ /* the bast that was queued was eliminated (see unnecessary above),
+ leaving nothing to return */
+
+ if (!ret_type)
+ goto try_another;
+
return error;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f..0a14074 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown)
return ret;
}
-static DEFINE_RWLOCK(fasync_lock);
+static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
+static void fasync_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(fasync_cache,
+ container_of(head, struct fasync_struct, fa_rcu));
+}
+
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +631,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
* NOTE! It is very important that the FASYNC flag always
* match the state "is the filp on a fasync list".
*
- * We always take the 'filp->f_lock', in since fasync_lock
- * needs to be irq-safe.
*/
static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
@@ -634,17 +638,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
int result = 0;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
+
+ spin_lock_irq(&fa->fa_lock);
+ fa->fa_file = NULL;
+ spin_unlock_irq(&fa->fa_lock);
+
*fp = fa->fa_next;
- kmem_cache_free(fasync_cache, fa);
+ call_rcu(&fa->fa_rcu, fasync_free_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
}
- write_unlock_irq(&fasync_lock);
+ spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -666,25 +675,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
return -ENOMEM;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
+
+ spin_lock_irq(&fa->fa_lock);
fa->fa_fd = fd;
+ spin_unlock_irq(&fa->fa_lock);
+
kmem_cache_free(fasync_cache, new);
goto out;
}
+ spin_lock_init(&new->fa_lock);
new->magic = FASYNC_MAGIC;
new->fa_file = filp;
new->fa_fd = fd;
new->fa_next = *fapp;
- *fapp = new;
+ rcu_assign_pointer(*fapp, new);
result = 1;
filp->f_flags |= FASYNC;
out:
- write_unlock_irq(&fasync_lock);
+ spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -704,37 +718,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
EXPORT_SYMBOL(fasync_helper);
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
- struct fown_struct * fown;
+ struct fown_struct *fown;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- fown = &fa->fa_file->f_owner;
- /* Don't send SIGURG to processes which have not set a
- queued signum: SIGURG has its own default signalling
- mechanism. */
- if (!(sig == SIGURG && fown->signum == 0))
- send_sigio(fown, fa->fa_fd, band);
- fa = fa->fa_next;
+ spin_lock(&fa->fa_lock);
+ if (fa->fa_file) {
+ fown = &fa->fa_file->f_owner;
+ /* Don't send SIGURG to processes which have not set a
+ queued signum: SIGURG has its own default signalling
+ mechanism. */
+ if (!(sig == SIGURG && fown->signum == 0))
+ send_sigio(fown, fa->fa_fd, band);
+ }
+ spin_unlock(&fa->fa_lock);
+ fa = rcu_dereference(fa->fa_next);
}
}
-EXPORT_SYMBOL(__kill_fasync);
-
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
/* First a quick test without locking: usually
* the list is empty.
*/
if (*fp) {
- read_lock(&fasync_lock);
- /* reread *fp after obtaining the lock */
- __kill_fasync(*fp, sig, band);
- read_unlock(&fasync_lock);
+ rcu_read_lock();
+ kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(kill_fasync);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b8..a739a0a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
{
struct buffer_head *dibh;
+ u64 dsize = i_size_read(&ip->i_inode);
void *kaddr;
int error;
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return error;
kaddr = kmap_atomic(page, KM_USER0);
- memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
- ip->i_disksize);
- memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+ if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+ dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+ memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+ memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
brelse(dibh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e411d5..4a48c0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
if (!PageUptodate(page)) {
void *kaddr = kmap(page);
+ u64 dsize = i_size_read(inode);
+
+ if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+ dsize = dibh->b_size - sizeof(struct gfs2_dinode);
- memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
- ip->i_disksize);
- memset(kaddr + ip->i_disksize, 0,
- PAGE_CACHE_SIZE - ip->i_disksize);
+ memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+ memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
kunmap(page);
SetPageUptodate(page);
@@ -1038,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
goto out;
if (gfs2_is_stuffed(ip)) {
- ip->i_disksize = size;
+ u64 dsize = size + sizeof(struct gfs2_inode);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
- gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+ if (dsize > dibh->b_size)
+ dsize = dibh->b_size;
+ gfs2_buffer_clear_tail(dibh, dsize);
error = 1;
-
} else {
if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc1..8295c5b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
inode = gfs2_inode_lookup(dir->i_sb,
be16_to_cpu(dent->de_type),
be64_to_cpu(dent->de_inum.no_addr),
- be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+ be64_to_cpu(dent->de_inum.no_formal_ino));
brelse(bh);
return inode;
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c22c211..dfe237a 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
if (error)
goto fail;
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto fail;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4..ddcdbf4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
gh->gh_flags = flags;
gh->gh_iflags = 0;
gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ if (gh->gh_owner_pid)
+ put_pid(gh->gh_owner_pid);
+ gh->gh_owner_pid = get_pid(task_pid(current));
}
/**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f..b5d7363 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
struct gfs2_tune {
spinlock_t gt_spin;
- unsigned int gt_incore_log_blocks;
- unsigned int gt_log_flush_secs;
-
unsigned int gt_logd_secs;
unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
SDF_SHUTDOWN = 2,
SDF_NOBARRIERS = 3,
SDF_NORECOVERY = 4,
+ SDF_DEMOTE = 5,
};
#define GFS2_FSNAME_LEN 256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
unsigned int sd_log_commited_databuf;
int sd_log_commited_revoke;
+ atomic_t sd_log_pinned;
unsigned int sd_log_num_buf;
unsigned int sd_log_num_revoke;
unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
+ atomic_t sd_log_thresh1;
+ atomic_t sd_log_thresh2;
atomic_t sd_log_blks_free;
- struct mutex sd_log_reserve_mutex;
+ wait_queue_head_t sd_log_waitq;
+ wait_queue_head_t sd_logd_waitq;
u64 sd_log_sequence;
unsigned int sd_log_head;
unsigned int sd_log_tail;
int sd_log_idle;
- unsigned long sd_log_flush_time;
struct rw_semaphore sd_log_flush_lock;
atomic_t sd_log_in_flight;
wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf269..51d8061 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
* @sb: The super block
* @no_addr: The inode number
* @type: The type of the inode
- * @skip_freeing: set this not return an inode if it is currently being freed.
*
* Returns: A VFS inode, or an error
*/
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
struct inode *gfs2_inode_lookup(struct super_block *sb,
unsigned int type,
u64 no_addr,
- u64 no_formal_ino, int skip_freeing)
+ u64 no_formal_ino)
{
struct inode *inode;
struct gfs2_inode *ip;
struct gfs2_glock *io_gl;
int error;
- if (skip_freeing)
- inode = gfs2_iget_skip(sb, no_addr);
- else
- inode = gfs2_iget(sb, no_addr);
+ inode = gfs2_iget(sb, no_addr);
ip = GFS2_I(inode);
if (!inode)
@@ -234,13 +230,100 @@ fail_glock:
fail_iopen:
gfs2_glock_put(io_gl);
fail_put:
- ip->i_gl->gl_object = NULL;
+ if (inode->i_state & I_NEW)
+ ip->i_gl->gl_object = NULL;
gfs2_glock_put(ip->i_gl);
fail:
- iget_failed(inode);
+ if (inode->i_state & I_NEW)
+ iget_failed(inode);
+ else
+ iput(inode);
return ERR_PTR(error);
}
+/**
+ * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation
+ * @sb: The super block
+ * no_addr: The inode number
+ * @@inode: A pointer to the inode found, if any
+ *
+ * Returns: 0 and *inode if no errors occurred. If an error occurs,
+ * the resulting *inode may or may not be NULL.
+ */
+
+int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
+ struct inode **inode)
+{
+ struct gfs2_sbd *sdp;
+ struct gfs2_inode *ip;
+ struct gfs2_glock *io_gl;
+ int error;
+ struct gfs2_holder gh;
+
+ *inode = gfs2_iget_skip(sb, no_addr);
+
+ if (!(*inode))
+ return -ENOBUFS;
+
+ if (!((*inode)->i_state & I_NEW))
+ return -ENOBUFS;
+
+ ip = GFS2_I(*inode);
+ sdp = GFS2_SB(*inode);
+ ip->i_no_formal_ino = -1;
+
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+ if (unlikely(error))
+ goto fail;
+ ip->i_gl->gl_object = ip;
+
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+ if (unlikely(error))
+ goto fail_put;
+
+ set_bit(GIF_INVALID, &ip->i_flags);
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
+ &ip->i_iopen_gh);
+ if (unlikely(error)) {
+ if (error == GLR_TRYFAILED)
+ error = 0;
+ goto fail_iopen;
+ }
+ ip->i_iopen_gh.gh_gl->gl_object = ip;
+ gfs2_glock_put(io_gl);
+
+ (*inode)->i_mode = DT2IF(DT_UNKNOWN);
+
+ /*
+ * We must read the inode in order to work out its type in
+ * this case. Note that this doesn't happen often as we normally
+ * know the type beforehand. This code path only occurs during
+ * unlinked inode recovery (where it is safe to do this glock,
+ * which is not true in the general case).
+ */
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
+ &gh);
+ if (unlikely(error)) {
+ if (error == GLR_TRYFAILED)
+ error = 0;
+ goto fail_glock;
+ }
+ /* Inode is now uptodate */
+ gfs2_glock_dq_uninit(&gh);
+ gfs2_set_iop(*inode);
+
+ return 0;
+fail_glock:
+ gfs2_glock_dq(&ip->i_iopen_gh);
+fail_iopen:
+ gfs2_glock_put(io_gl);
+fail_put:
+ ip->i_gl->gl_object = NULL;
+ gfs2_glock_put(ip->i_gl);
+fail:
+ return error;
+}
+
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
const struct gfs2_dinode *str = buf;
@@ -862,7 +945,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
goto fail_gunlock2;
inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
- inum.no_formal_ino, 0);
+ inum.no_formal_ino);
if (IS_ERR(inode))
goto fail_gunlock2;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf..e161461 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,9 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
extern void gfs2_set_iop(struct inode *inode);
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- int skip_freeing);
+ u64 no_addr, u64 no_formal_ino);
+extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
+ struct inode **inode);
extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b5..b593f0e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
return list_empty(&ai->ai_ail1_list);
}
-static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
{
struct list_head *head;
u64 sync_gen;
- struct list_head *first;
- struct gfs2_ail *first_ai, *ai, *tmp;
+ struct gfs2_ail *ai;
int done = 0;
gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
}
sync_gen = sdp->sd_ail_sync_gen++;
- first = head->prev;
- first_ai = list_entry(first, struct gfs2_ail, ai_list);
- first_ai->ai_sync_gen = sync_gen;
- gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
-
- if (flags & DIO_ALL)
- first = NULL;
-
while(!done) {
- if (first && (head->prev != first ||
- gfs2_ail1_empty_one(sdp, first_ai, 0)))
- break;
-
done = 1;
- list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+ list_for_each_entry_reverse(ai, head, ai_list) {
if (ai->ai_sync_gen >= sync_gen)
continue;
ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
* flush time, so we ensure that we have just enough free blocks at all
* times to avoid running out during a log flush.
*
+ * We no longer flush the log here, instead we wake up logd to do that
+ * for us. To avoid the thundering herd and to ensure that we deal fairly
+ * with queued waiters, we use an exclusive wait. This means that when we
+ * get woken with enough journal space to get our reservation, we need to
+ * wake the next waiter on the list.
+ *
* Returns: errno
*/
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
- unsigned int try = 0;
unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+ unsigned wanted = blks + reserved_blks;
+ DEFINE_WAIT(wait);
+ int did_wait = 0;
+ unsigned int free_blocks;
if (gfs2_assert_warn(sdp, blks) ||
gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
return -EINVAL;
-
- mutex_lock(&sdp->sd_log_reserve_mutex);
- gfs2_log_lock(sdp);
- while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
- gfs2_log_unlock(sdp);
- gfs2_ail1_empty(sdp, 0);
- gfs2_log_flush(sdp, NULL);
-
- if (try++)
- gfs2_ail1_start(sdp, 0);
- gfs2_log_lock(sdp);
+retry:
+ free_blocks = atomic_read(&sdp->sd_log_blks_free);
+ if (unlikely(free_blocks <= wanted)) {
+ do {
+ prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ wake_up(&sdp->sd_logd_waitq);
+ did_wait = 1;
+ if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
+ io_schedule();
+ free_blocks = atomic_read(&sdp->sd_log_blks_free);
+ } while(free_blocks <= wanted);
+ finish_wait(&sdp->sd_log_waitq, &wait);
}
- atomic_sub(blks, &sdp->sd_log_blks_free);
+ if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
+ free_blocks - blks) != free_blocks)
+ goto retry;
trace_gfs2_log_blocks(sdp, -blks);
- gfs2_log_unlock(sdp);
- mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+ /*
+ * If we waited, then so might others, wake them up _after_ we get
+ * our share of the log.
+ */
+ if (unlikely(did_wait))
+ wake_up(&sdp->sd_log_waitq);
down_read(&sdp->sd_log_flush_lock);
return 0;
}
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
- gfs2_log_lock(sdp);
- atomic_add(blks, &sdp->sd_log_blks_free);
- trace_gfs2_log_blocks(sdp, blks);
- gfs2_assert_withdraw(sdp,
- atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
- gfs2_log_unlock(sdp);
- up_read(&sdp->sd_log_flush_lock);
-}
-
static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
{
struct gfs2_journal_extent *je;
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
ail2_empty(sdp, new_tail);
- gfs2_log_lock(sdp);
atomic_add(dist, &sdp->sd_log_blks_free);
trace_gfs2_log_blocks(sdp, dist);
- gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
- gfs2_log_unlock(sdp);
+ gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+ sdp->sd_jdesc->jd_blocks);
sdp->sd_log_tail = new_tail;
}
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
set_buffer_uptodate(bh);
+ fs_info(sdp, "barrier sync failed - disabling barriers\n");
set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
lock_buffer(bh);
skip_barrier:
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
* @sdp: the filesystem
* @tr: the transaction
*
+ * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
+ * or the total number of used blocks (pinned blocks plus AIL blocks)
+ * is greater than thresh2.
+ *
+ * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * journal size.
+ *
* Returns: errno
*/
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
up_read(&sdp->sd_log_flush_lock);
- gfs2_log_lock(sdp);
- if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
- wake_up_process(sdp->sd_logd_process);
- gfs2_log_unlock(sdp);
+ if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
+ ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
+ atomic_read(&sdp->sd_log_thresh2)))
+ wake_up(&sdp->sd_logd_waitq);
}
/**
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
{
gfs2_log_flush(sdp, NULL);
for (;;) {
- gfs2_ail1_start(sdp, DIO_ALL);
+ gfs2_ail1_start(sdp);
if (gfs2_ail1_empty(sdp, DIO_ALL))
break;
msleep(10);
}
}
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+ return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+ unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+ return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+}
/**
* gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
int gfs2_logd(void *data)
{
struct gfs2_sbd *sdp = data;
- unsigned long t;
- int need_flush;
+ unsigned long t = 1;
+ DEFINE_WAIT(wait);
+ unsigned preflush;
while (!kthread_should_stop()) {
- /* Advance the log tail */
- t = sdp->sd_log_flush_time +
- gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+ preflush = atomic_read(&sdp->sd_log_pinned);
+ if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
+ gfs2_ail1_empty(sdp, DIO_ALL);
+ gfs2_log_flush(sdp, NULL);
+ gfs2_ail1_empty(sdp, DIO_ALL);
+ }
- gfs2_ail1_empty(sdp, DIO_ALL);
- gfs2_log_lock(sdp);
- need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
- gfs2_log_unlock(sdp);
- if (need_flush || time_after_eq(jiffies, t)) {
+ if (gfs2_ail_flush_reqd(sdp)) {
+ gfs2_ail1_start(sdp);
+ io_schedule();
+ gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL);
- sdp->sd_log_flush_time = jiffies;
+ gfs2_ail1_empty(sdp, DIO_ALL);
}
+ wake_up(&sdp->sd_log_waitq);
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
if (freezing(current))
refrigerator();
- schedule_timeout_interruptible(t);
+
+ do {
+ prepare_to_wait(&sdp->sd_logd_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!gfs2_ail_flush_reqd(sdp) &&
+ !gfs2_jrnl_flush_reqd(sdp) &&
+ !kthread_should_stop())
+ t = schedule_timeout(t);
+ } while(t && !gfs2_ail_flush_reqd(sdp) &&
+ !gfs2_jrnl_flush_reqd(sdp) &&
+ !kthread_should_stop());
+ finish_wait(&sdp->sd_logd_waitq, &wait);
}
return 0;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510..eb570b4 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
void gfs2_log_incr_head(struct gfs2_sbd *sdp);
struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260f..bf33f82 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
if (bd->bd_ail)
list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
get_bh(bh);
+ atomic_inc(&sdp->sd_log_pinned);
trace_gfs2_pin(bd, 1);
}
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
trace_gfs2_pin(bd, 0);
gfs2_log_unlock(sdp);
unlock_buffer(bh);
+ atomic_dec(&sdp->sd_log_pinned);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc..fb2a5f9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_glock_cachep)
goto fail;
- gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
+ gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
sizeof(struct gfs2_glock) +
sizeof(struct address_space),
0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c8..18176d0 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
{
- int err;
struct buffer_head *bh, *head;
int nr_underway = 0;
int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
} while (bh != head);
unlock_page(page);
- err = 0;
if (nr_underway == 0)
end_page_writeback(page);
- return err;
+ return 0;
}
const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
struct gfs2_bufdata *bd = bh->b_private;
if (test_clear_buffer_pinned(bh)) {
+ atomic_dec(&sdp->sd_log_pinned);
list_del_init(&bd->bd_le.le_list);
if (meta) {
gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed..3593b3a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
{
spin_lock_init(&gt->gt_spin);
- gt->gt_incore_log_blocks = 1024;
- gt->gt_logd_secs = 1;
gt->gt_quota_simul_sync = 64;
gt->gt_quota_warn_period = 10;
gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_log_lock);
-
+ atomic_set(&sdp->sd_log_pinned, 0);
INIT_LIST_HEAD(&sdp->sd_log_le_buf);
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
INIT_LIST_HEAD(&sdp->sd_log_le_rg);
INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
- mutex_init(&sdp->sd_log_reserve_mutex);
+ init_waitqueue_head(&sdp->sd_log_waitq);
+ init_waitqueue_head(&sdp->sd_logd_waitq);
INIT_LIST_HEAD(&sdp->sd_ail1_list);
INIT_LIST_HEAD(&sdp->sd_ail2_list);
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
struct dentry *dentry;
struct inode *inode;
- inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
if (IS_ERR(inode)) {
fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (sdp->sd_args.ar_spectator) {
sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+ atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+ atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
} else {
if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
goto fail_jinode_gh;
}
atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+ atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+ atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
/* Map the extents for this journal's blocks */
map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_quotad;
- sdp->sd_log_flush_time = jiffies;
-
p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
error = IS_ERR(p);
if (error) {
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
GFS2_BASIC_BLOCK_SHIFT;
sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
- sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+ sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
if (sdp->sd_args.ar_statfs_quantum) {
sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
memset(&args, 0, sizeof(args));
args.ar_quota = GFS2_QUOTA_DEFAULT;
args.ar_data = GFS2_DATA_DEFAULT;
- args.ar_commit = 60;
+ args.ar_commit = 30;
args.ar_statfs_quantum = 30;
args.ar_quota_quantum = 60;
args.ar_errors = GFS2_ERRORS_DEFAULT;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad..d5f4661 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
unsigned blocksize, iblock, pos;
struct buffer_head *bh, *dibh;
struct page *page;
- void *kaddr;
- struct gfs2_quota *qp;
- s64 value;
- int err = -EIO;
+ void *kaddr, *ptr;
+ struct gfs2_quota q, *qp;
+ int err, nbytes;
u64 size;
if (gfs2_is_stuffed(ip))
gfs2_unstuff_dinode(ip, NULL);
-
+
+ memset(&q, 0, sizeof(struct gfs2_quota));
+ err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+ if (err < 0)
+ return err;
+
+ err = -EIO;
+ qp = &q;
+ qp->qu_value = be64_to_cpu(qp->qu_value);
+ qp->qu_value += change;
+ qp->qu_value = cpu_to_be64(qp->qu_value);
+ qd->qd_qb.qb_value = qp->qu_value;
+ if (fdq) {
+ if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+ qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+ qd->qd_qb.qb_warn = qp->qu_warn;
+ }
+ if (fdq->d_fieldmask & FS_DQ_BHARD) {
+ qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+ qd->qd_qb.qb_limit = qp->qu_limit;
+ }
+ }
+
+ /* Write the quota into the quota file on disk */
+ ptr = qp;
+ nbytes = sizeof(struct gfs2_quota);
+get_a_page:
page = grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
if (!buffer_mapped(bh)) {
gfs2_block_map(inode, iblock, bh, 1);
if (!buffer_mapped(bh))
- goto unlock;
+ goto unlock_out;
+ /* If it's a newly allocated disk block for quota, zero it */
+ if (buffer_new(bh)) {
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ }
}
if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
ll_rw_block(READ_META, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
- goto unlock;
+ goto unlock_out;
}
gfs2_trans_add_bh(ip->i_gl, bh, 0);
kaddr = kmap_atomic(page, KM_USER0);
- qp = kaddr + offset;
- value = (s64)be64_to_cpu(qp->qu_value) + change;
- qp->qu_value = cpu_to_be64(value);
- qd->qd_qb.qb_value = qp->qu_value;
- if (fdq) {
- if (fdq->d_fieldmask & FS_DQ_BSOFT) {
- qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
- qd->qd_qb.qb_warn = qp->qu_warn;
- }
- if (fdq->d_fieldmask & FS_DQ_BHARD) {
- qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
- qd->qd_qb.qb_limit = qp->qu_limit;
- }
- }
+ if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
+ nbytes = PAGE_CACHE_SIZE - offset;
+ memcpy(kaddr + offset, ptr, nbytes);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
+ unlock_page(page);
+ page_cache_release(page);
+ /* If quota straddles page boundary, we need to update the rest of the
+ * quota at the beginning of the next page */
+ if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+ ptr = ptr + nbytes;
+ nbytes = sizeof(struct gfs2_quota) - nbytes;
+ offset = 0;
+ index++;
+ goto get_a_page;
+ }
+
+ /* Update the disk inode timestamp and size (if extended) */
err = gfs2_meta_inode_buffer(ip, &dibh);
if (err)
- goto unlock;
+ goto out;
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
brelse(dibh);
mark_inode_dirty(inode);
-unlock:
+out:
+ return err;
+unlock_out:
unlock_page(page);
page_cache_release(page);
return err;
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
* rgrp since it won't be allocated during the transaction
*/
al->al_requested = 1;
- /* +1 in the end for block requested above for unstuffing */
- blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
+ /* +3 in the end for unstuffing block, inode size update block
+ * and another block in case quota straddles page boundary and
+ * two blocks need to be updated instead of 1 */
+ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
if (nalloc)
al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
memset(fqs, 0, sizeof(struct fs_quota_stat));
fqs->qs_version = FS_QSTAT_VERSION;
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
- fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
- else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
- fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+
+ switch (sdp->sd_args.ar_quota) {
+ case GFS2_QUOTA_ON:
+ fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+ /*FALLTHRU*/
+ case GFS2_QUOTA_ACCOUNT:
+ fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+ break;
+ case GFS2_QUOTA_OFF:
+ break;
+ }
+
if (sdp->sd_quota_inode) {
fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842..8bce73e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -948,13 +948,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
* try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
* @rgd: The rgrp
*
- * Returns: The inode, if one has been found
+ * Returns: 0 if no error
+ * The inode, if one has been found, in inode.
*/
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
- u64 skip)
+static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+ u64 skip)
{
- struct inode *inode;
u32 goal = 0, block;
u64 no_addr;
struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +979,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
if (no_addr == skip)
continue;
*last_unlinked = no_addr;
- inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
- no_addr, -1, 1);
- if (!IS_ERR(inode))
- return inode;
+ return no_addr;
}
rgd->rd_flags &= ~GFS2_RDF_CHECK;
- return NULL;
+ return 0;
}
/**
@@ -1067,11 +1064,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
* Try to acquire rgrp in way which avoids contending with others.
*
* Returns: errno
+ * unlinked: the block address of an unlinked block to be reclaimed
*/
-static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+ u64 *last_unlinked)
{
- struct inode *inode = NULL;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1078,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
int loops = 0;
int error, rg_locked;
+ *unlinked = 0;
rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
@@ -1096,19 +1095,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- if (rgd->rd_flags & GFS2_RDF_CHECK)
- inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+ /* If the rg came in already locked, there's no
+ way we can recover from a failed try_rgrp_unlink
+ because that would require an iput which can only
+ happen after the rgrp is unlocked. */
+ if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+ *unlinked = try_rgrp_unlink(rgd, last_unlinked,
+ ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (inode)
- return inode;
+ if (*unlinked)
+ return -EAGAIN;
/* fall through */
case GLR_TRYFAILED:
rgd = recent_rgrp_next(rgd);
break;
default:
- return ERR_PTR(error);
+ return error;
}
}
@@ -1130,12 +1134,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- if (rgd->rd_flags & GFS2_RDF_CHECK)
- inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+ if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+ *unlinked = try_rgrp_unlink(rgd, last_unlinked,
+ ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (inode)
- return inode;
+ if (*unlinked)
+ return -EAGAIN;
break;
case GLR_TRYFAILED:
@@ -1143,7 +1148,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
break;
default:
- return ERR_PTR(error);
+ return error;
}
rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1157,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
if (rgd == begin) {
if (++loops >= 3)
- return ERR_PTR(-ENOSPC);
+ return -ENOSPC;
if (!skipped)
loops++;
flags = 0;
@@ -1172,7 +1177,7 @@ out:
forward_rgrp_set(sdp, rgd);
}
- return NULL;
+ return 0;
}
/**
@@ -1188,7 +1193,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
struct gfs2_alloc *al = ip->i_alloc;
struct inode *inode;
int error = 0;
- u64 last_unlinked = NO_BLOCK;
+ u64 last_unlinked = NO_BLOCK, unlinked;
if (gfs2_assert_warn(sdp, al->al_requested))
return -EINVAL;
@@ -1204,14 +1209,19 @@ try_again:
if (error)
return error;
- inode = get_local_rgrp(ip, &last_unlinked);
- if (inode) {
+ error = get_local_rgrp(ip, &unlinked, &last_unlinked);
+ if (error) {
if (ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- iput(inode);
+ if (error != -EAGAIN)
+ return error;
+ error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb,
+ unlinked, &inode);
+ if (inode)
+ iput(inode);
gfs2_log_flush(sdp, NULL);
+ if (error == GLR_TRYFAILED)
+ error = 0;
goto try_again;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac60..4d1aad3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
int error;
spin_lock(&gt->gt_spin);
- args.ar_commit = gt->gt_log_flush_secs;
+ args.ar_commit = gt->gt_logd_secs;
args.ar_quota_quantum = gt->gt_quota_quantum;
if (gt->gt_statfs_slow)
args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
else
clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
spin_lock(&gt->gt_spin);
- gt->gt_log_flush_secs = args.ar_commit;
+ gt->gt_logd_secs = args.ar_commit;
gt->gt_quota_quantum = args.ar_quota_quantum;
if (args.ar_statfs_quantum) {
gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
}
if (args->ar_discard)
seq_printf(s, ",discard");
- val = sdp->sd_tune.gt_log_flush_secs;
- if (val != 60)
+ val = sdp->sd_tune.gt_logd_secs;
+ if (val != 30)
seq_printf(s, ",commit=%d", val);
val = sdp->sd_tune.gt_statfs_quantum;
if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
}
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
seq_printf(s, ",nobarrier");
-
+ if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
+ seq_printf(s, ",demote_interface_used");
return 0;
}
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 54fd984..37f5393 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -232,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
glops = gfs2_glops_list[gltype];
if (glops == NULL)
return -EINVAL;
+ if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
+ fs_info(sdp, "demote interface used\n");
rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
if (rv)
return rv;
@@ -468,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
} \
TUNE_ATTR_2(name, name##_store)
-TUNE_ATTR(incore_log_blocks, 0);
-TUNE_ATTR(log_flush_secs, 0);
TUNE_ATTR(quota_warn_period, 0);
TUNE_ATTR(quota_quantum, 0);
TUNE_ATTR(max_readahead, 0);
@@ -481,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
static struct attribute *tune_attrs[] = {
- &tune_attr_incore_log_blocks.attr,
- &tune_attr_log_flush_secs.attr,
&tune_attr_quota_warn_period.attr,
&tune_attr_quota_quantum.attr,
&tune_attr_max_readahead.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9f..9ec73a8 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
#include "meta_io.h"
#include "trans.h"
#include "util.h"
+#include "trace_gfs2.h"
int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
return error;
}
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+ atomic_add(blks, &sdp->sd_log_blks_free);
+ trace_gfs2_log_blocks(sdp, blks);
+ gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+ sdp->sd_jdesc->jd_blocks);
+ up_read(&sdp->sd_log_flush_lock);
+}
+
void gfs2_trans_end(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/inode.c b/fs/inode.c
index 407bf39..258ec22 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1205,8 +1205,6 @@ void generic_delete_inode(struct inode *inode)
inodes_stat.nr_inodes--;
spin_unlock(&inode_lock);
- security_inode_delete(inode);
-
if (op->delete_inode) {
void (*delete)(struct inode *) = op->delete_inode;
/* Filesystems implementing their own
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dc..bc2ff59 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size)
BUG_ON(i >= JBD2_MAX_SLABS);
if (unlikely(i < 0))
i = 0;
- BUG_ON(jbd2_slab[i] == 0);
+ BUG_ON(jbd2_slab[i] == NULL);
return jbd2_slab[i];
}
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da..55f1dde 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
{
- spin_lock(&c->erase_completion_lock);
+ assert_spin_locked(&c->erase_completion_lock);
if (c->gc_task && jffs2_thread_should_wake(c))
send_sig(SIGHUP, c->gc_task, 1);
- spin_unlock(&c->erase_completion_lock);
}
/* This must only ever be called when no GC thread is currently running */
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679b..6286ad9 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
jffs2_erase_failed(c, jeb, bad_offset);
}
-void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
{
struct jffs2_eraseblock *jeb;
+ int work_done = 0;
mutex_lock(&c->erase_free_sem);
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
mutex_unlock(&c->erase_free_sem);
jffs2_mark_erased_block(c, jeb);
+ work_done++;
if (!--count) {
D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
mutex_unlock(&c->erase_free_sem);
done:
D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
+ return work_done;
}
static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move_tail(&jeb->list, &c->erase_complete_list);
+ /* Wake the GC thread to mark them clean */
+ jffs2_garbage_collect_trigger(c);
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->erase_free_sem);
- /* Ensure that kupdated calls us again to mark them clean */
- jffs2_erase_pending_trigger(c);
+ wake_up(&c->erase_wait);
}
static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
refile:
/* Stick it back on the list from whence it came and come back later */
- jffs2_erase_pending_trigger(c);
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
+ jffs2_garbage_collect_trigger(c);
list_move(&jeb->list, &c->erase_complete_list);
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81..86e0821 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
case S_IFBLK:
case S_IFCHR:
/* Read the device numbers from the media */
- if (f->metadata->size != sizeof(jdev.old) &&
- f->metadata->size != sizeof(jdev.new)) {
+ if (f->metadata->size != sizeof(jdev.old_id) &&
+ f->metadata->size != sizeof(jdev.new_id)) {
printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
goto error_io;
}
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
goto error;
}
- if (f->metadata->size == sizeof(jdev.old))
- rdev = old_decode_dev(je16_to_cpu(jdev.old));
+ if (f->metadata->size == sizeof(jdev.old_id))
+ rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
else
- rdev = new_decode_dev(je32_to_cpu(jdev.new));
+ rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
case S_IFSOCK:
case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa..f5e96bd 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
return ret;
}
+ /* If there are any blocks which need erasing, erase them now */
+ if (!list_empty(&c->erase_complete_list) ||
+ !list_empty(&c->erase_pending_list)) {
+ spin_unlock(&c->erase_completion_lock);
+ D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
+ if (jffs2_erase_pending_blocks(c, 1)) {
+ mutex_unlock(&c->alloc_sem);
+ return 0;
+ }
+ D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
+ spin_lock(&c->erase_completion_lock);
+ }
+
/* First, work out which block we're garbage-collecting */
jeb = c->gcblock;
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
if (!jeb) {
/* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
- if (!list_empty(&c->erase_pending_list)) {
+ if (c->nr_erasing_blocks) {
spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
list_add_tail(&c->gcblock->list, &c->erase_pending_list);
c->gcblock = NULL;
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
}
spin_unlock(&c->erase_completion_lock);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6e..a881a42 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
{
if (old_valid_dev(rdev)) {
- jdev->old = cpu_to_je16(old_encode_dev(rdev));
- return sizeof(jdev->old);
+ jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
+ return sizeof(jdev->old_id);
} else {
- jdev->new = cpu_to_je32(new_encode_dev(rdev));
- return sizeof(jdev->new);
+ jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
+ return sizeof(jdev->new_id);
}
}
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
int jffs2_do_mount_fs(struct jffs2_sb_info *c);
/* erase.c */
-void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 191359d..694aa5b 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
ret = jffs2_garbage_collect_pass(c);
- if (ret == -EAGAIN)
- jffs2_erase_pending_blocks(c, 1);
- else if (ret)
+ if (ret == -EAGAIN) {
+ spin_lock(&c->erase_completion_lock);
+ if (c->nr_erasing_blocks &&
+ list_empty(&c->erase_pending_list) &&
+ list_empty(&c->erase_complete_list)) {
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&c->erase_wait, &wait);
+ D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
+ spin_unlock(&c->erase_completion_lock);
+
+ schedule();
+ } else
+ spin_unlock(&c->erase_completion_lock);
+ } else if (ret)
return ret;
cond_resched();
@@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
list_move_tail(&ejeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
ejeb->offset));
}
@@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
void jffs2_complete_reservation(struct jffs2_sb_info *c)
{
D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
+ spin_lock(&c->erase_completion_lock);
jffs2_garbage_collect_trigger(c);
+ spin_unlock(&c->erase_completion_lock);
mutex_unlock(&c->alloc_sem);
}
@@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
list_add_tail(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
} else {
/* Sometimes, however, we leave it elsewhere so it doesn't get
immediately reused, and we spread the load a bit. */
@@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
int nr_very_dirty = 0;
struct jffs2_eraseblock *jeb;
+ if (!list_empty(&c->erase_complete_list) ||
+ !list_empty(&c->erase_pending_list))
+ return 1;
+
if (c->unchecked_size) {
D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
c->unchecked_size, c->checked_ino));
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7..035a767 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
#endif /* WRITEBUFFER */
-/* erase.c */
-static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
+static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
{
OFNI_BS_2SFFJ(c)->s_dirt = 1;
}
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686c..46f870d 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
ret = -EIO;
goto out;
}
- jffs2_erase_pending_trigger(c);
+ spin_lock(&c->erase_completion_lock);
+ jffs2_garbage_collect_trigger(c);
+ spin_unlock(&c->erase_completion_lock);
}
ret = 0;
out:
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e..511e2d6 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
- jffs2_garbage_collect_trigger(c);
- jffs2_erase_pending_blocks(c, 0);
jffs2_flush_wbuf_gc(c, 0);
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac..07ee154 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
struct jffs2_inodirty *new;
/* Mark the superblock dirty so that kupdated will flush... */
- jffs2_erase_pending_trigger(c);
+ jffs2_dirty_trigger(c);
if (jffs2_wbuf_pending_for_ino(c, ino))
return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
list_add_tail(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
} else {
/* Sometimes, however, we leave it elsewhere so it doesn't get
immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
list_add(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
}
if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
list_move(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
- jffs2_erase_pending_trigger(c);
+ jffs2_garbage_collect_trigger(c);
}
jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9e2f6a7..c92ea3b 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2438,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
/* check if this is a control page update for an allocation.
* if so, update the leaf to reflect the new leaf value using
- * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
+ * dbSplit(); otherwise (deallocation), use dbJoin() to update
* the leaf with the new value. in addition to updating the
* leaf, dbSplit() will also split the binary buddy system of
* the leaves, if required, and bubble new values within the
diff --git a/fs/libfs.c b/fs/libfs.c
index ea9a6cc..232bea4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -547,6 +547,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
}
/**
+ * simple_write_to_buffer - copy data from user space to the buffer
+ * @to: the buffer to write to
+ * @available: the size of the buffer
+ * @ppos: the current position in the buffer
+ * @from: the user space buffer to read from
+ * @count: the maximum number of bytes to read
+ *
+ * The simple_write_to_buffer() function reads up to @count bytes from the user
+ * space address starting at @from into the buffer @to at offset @ppos.
+ *
+ * On success, the number of bytes written is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
+ const void __user *from, size_t count)
+{
+ loff_t pos = *ppos;
+ size_t res;
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= available || !count)
+ return 0;
+ if (count > available - pos)
+ count = available - pos;
+ res = copy_from_user(to + pos, from, count);
+ if (res == count)
+ return -EFAULT;
+ count -= res;
+ *ppos = pos + count;
+ return count;
+}
+
+/**
* memory_read_from_buffer - copy data from the buffer
* @to: the kernel space buffer to read to
* @count: the maximum number of bytes to read
@@ -864,6 +898,7 @@ EXPORT_SYMBOL(simple_statfs);
EXPORT_SYMBOL(simple_sync_file);
EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(simple_write_to_buffer);
EXPORT_SYMBOL(memory_read_from_buffer);
EXPORT_SYMBOL(simple_transaction_set);
EXPORT_SYMBOL(simple_transaction_get);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 2396a85..72d1893 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -12,7 +12,7 @@
* Atomic dir operations
*
* Directory operations are by default not atomic. Dentries and Inodes are
- * created/removed/altered in seperate operations. Therefore we need to do
+ * created/removed/altered in separate operations. Therefore we need to do
* a small amount of journaling.
*
* Create, link, mkdir, mknod and symlink all share the same function to do
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 93b55f3..1a9db84 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -707,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
u8 level = (__force u8)__level;
if (ino == LOGFS_INO_MASTER) {
- /* ifile has seperate areas */
+ /* ifile has separate areas */
level += LOGFS_MAX_LEVELS;
}
return (__force gc_level_t)level;
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725..ae96051 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
* 12 - gc recycled blocks, long-lived data
* 13 - replacement blocks, short-lived data
*
- * Levels 1-11 are necessary for robust gc operations and help seperate
+ * Levels 1-11 are necessary for robust gc operations and help separate
* short-lived metadata from longer-lived file data. In the future,
- * file data should get seperated into several segments based on simple
+ * file data should get separated into several segments based on simple
* heuristics. Old data recycled during gc operation is expected to be
* long-lived. New data is of uncertain life expectancy. New data
* used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
/*
- * LogFS needs to seperate data into levels. Each level is defined as the
+ * LogFS needs to separate data into levels. Each level is defined as the
* maximal possible distance from the master inode (inode of the inode file).
* Data blocks reside on level 0, 1x indirect block on level 1, etc.
* Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
* @ds_crc: crc32 of structure starting with the next field
* @ds_ifile_levels: maximum number of levels for ifile
* @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of seperate levels for data
+ * @ds_data_levels: number of separate levels for data
* @pad0: reserved, must be 0
* @ds_feature_incompat: incompatible filesystem features
* @ds_feature_ro_compat: read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
* @vim: life expectancy of data
*
* "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to seperate long-living from
+ * one area per GC level. Several may be used to separate long-living from
* short-living data. If an area with unknown vim is encountered, it can
* simply be closed.
* The write buffer immediately follow this header.
diff --git a/fs/namespace.c b/fs/namespace.c
index f20cb57..88058de 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -628,7 +628,6 @@ repeat:
mnt->mnt_pinned = 0;
spin_unlock(&vfsmount_lock);
acct_auto_close_mnt(mnt);
- security_sb_umount_close(mnt);
goto repeat;
}
}
@@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
retval = 0;
}
spin_unlock(&vfsmount_lock);
- if (retval)
- security_sb_umount_busy(mnt);
up_write(&namespace_sem);
release_mounts(&umount_list);
return retval;
@@ -1435,17 +1432,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
if (cant_mount(path->dentry))
goto out_unlock;
- err = security_sb_check_sb(mnt, path);
- if (err)
- goto out_unlock;
-
- err = -ENOENT;
if (!d_unlinked(path->dentry))
err = attach_recursive_mnt(mnt, path, NULL);
out_unlock:
mutex_unlock(&path->dentry->d_inode->i_mutex);
- if (!err)
- security_sb_post_addmount(mnt, path);
return err;
}
@@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
up_write(&sb->s_umount);
if (!err) {
- security_sb_post_remount(path->mnt, flags, data);
-
spin_lock(&vfsmount_lock);
touch_mnt_namespace(path->mnt->mnt_ns);
spin_unlock(&vfsmount_lock);
@@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
touch_mnt_namespace(current->nsproxy->mnt_ns);
spin_unlock(&vfsmount_lock);
chroot_fs_refs(&root, &new);
- security_sb_post_pivotroot(&root, &new);
error = 0;
path_put(&root_parent);
path_put(&parent_path);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index acc9c49..7ec9b34 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -934,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
}
fsinfo.fattr = fattr;
- nfs_fattr_init(fattr);
error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
if (error < 0)
goto out_error;
@@ -1047,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
struct nfs_fh *mntfh)
{
struct nfs_server *server;
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr;
int error;
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
/* Get a client representation */
error = nfs_init_server(server, data);
if (error < 0)
@@ -1064,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
/* Probe the root fh to retrieve its FSID */
- error = nfs_probe_fsinfo(server, mntfh, &fattr);
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
if (error < 0)
goto error;
if (server->nfs_client->rpc_ops->version == 3) {
@@ -1077,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
server->namelen = NFS2_MAXNAMLEN;
}
- if (!(fattr.valid & NFS_ATTR_FATTR)) {
- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+ if (!(fattr->valid & NFS_ATTR_FATTR)) {
+ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
if (error < 0) {
dprintk("nfs_create_server: getattr error = %d\n", -error);
goto error;
}
}
- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
dprintk("Server FSID: %llx:%llx\n",
(unsigned long long) server->fsid.major,
@@ -1096,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
spin_unlock(&nfs_client_lock);
server->mount_time = jiffies;
+ nfs_free_fattr(fattr);
return server;
error:
+ nfs_free_fattr(fattr);
nfs_free_server(server);
return ERR_PTR(error);
}
@@ -1340,7 +1346,7 @@ error:
struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
struct nfs_fh *mntfh)
{
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr;
struct nfs_server *server;
int error;
@@ -1350,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (!server)
return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
/* set up the general RPC client */
error = nfs4_init_server(server, data);
if (error < 0)
@@ -1364,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
goto error;
/* Probe the root fh to retrieve its FSID */
- error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
+ error = nfs4_get_rootfh(server, mntfh);
if (error < 0)
goto error;
@@ -1375,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
nfs4_session_set_rwsize(server);
- error = nfs_probe_fsinfo(server, mntfh, &fattr);
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
if (error < 0)
goto error;
@@ -1389,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
server->mount_time = jiffies;
dprintk("<-- nfs4_create_server() = %p\n", server);
+ nfs_free_fattr(fattr);
return server;
error:
+ nfs_free_fattr(fattr);
nfs_free_server(server);
dprintk("<-- nfs4_create_server() = error %d\n", error);
return ERR_PTR(error);
@@ -1405,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
{
struct nfs_client *parent_client;
struct nfs_server *server, *parent_server;
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr;
int error;
dprintk("--> nfs4_create_referral_server()\n");
@@ -1414,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (!server)
return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto error;
+
parent_server = NFS_SB(data->sb);
parent_client = parent_server->nfs_client;
@@ -1443,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
/* Probe the root fh to retrieve its FSID and filehandle */
- error = nfs4_path_walk(server, mntfh, data->mnt_path);
+ error = nfs4_get_rootfh(server, mntfh);
if (error < 0)
goto error;
/* probe the filesystem info for this server filesystem */
- error = nfs_probe_fsinfo(server, mntfh, &fattr);
+ error = nfs_probe_fsinfo(server, mntfh, fattr);
if (error < 0)
goto error;
@@ -1466,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
server->mount_time = jiffies;
+ nfs_free_fattr(fattr);
dprintk("<-- nfs_create_referral_server() = %p\n", server);
return server;
error:
+ nfs_free_fattr(fattr);
nfs_free_server(server);
dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
return ERR_PTR(error);
@@ -1485,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
struct nfs_fattr *fattr)
{
struct nfs_server *server;
- struct nfs_fattr fattr_fsinfo;
+ struct nfs_fattr *fattr_fsinfo;
int error;
dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1496,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
if (!server)
return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ fattr_fsinfo = nfs_alloc_fattr();
+ if (fattr_fsinfo == NULL)
+ goto out_free_server;
+
/* Copy data from the source */
server->nfs_client = source->nfs_client;
atomic_inc(&server->nfs_client->cl_count);
@@ -1512,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
nfs_init_server_aclclient(server);
/* probe the filesystem info for this server filesystem */
- error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+ error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
if (error < 0)
goto out_free_server;
@@ -1534,10 +1559,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
server->mount_time = jiffies;
+ nfs_free_fattr(fattr_fsinfo);
dprintk("<-- nfs_clone_server() = %p\n", server);
return server;
out_free_server:
+ nfs_free_fattr(fattr_fsinfo);
nfs_free_server(server);
dprintk("<-- nfs_clone_server() = error %d\n", error);
return ERR_PTR(error);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ea61d26..3016345 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
struct nfs_delegation *freeme = NULL;
int status = 0;
- delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
+ delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
if (delegation == NULL)
return -ENOMEM;
memcpy(delegation->stateid.data, res->delegation.data,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a7bb5c6..ee9a179 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
struct nfs_entry my_entry;
- struct nfs_fh fh;
- struct nfs_fattr fattr;
- long res;
+ int res = -ENOMEM;
dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
my_entry.cookie = my_entry.prev_cookie = 0;
my_entry.eof = 0;
- my_entry.fh = &fh;
- my_entry.fattr = &fattr;
- nfs_fattr_init(&fattr);
+ my_entry.fh = nfs_alloc_fhandle();
+ my_entry.fattr = nfs_alloc_fattr();
+ if (my_entry.fh == NULL || my_entry.fattr == NULL)
+ goto out_alloc_failed;
+
desc->entry = &my_entry;
nfs_block_sillyrename(dentry);
@@ -598,7 +598,10 @@ out:
nfs_unblock_sillyrename(dentry);
if (res > 0)
res = 0;
- dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
+out_alloc_failed:
+ nfs_free_fattr(my_entry.fattr);
+ nfs_free_fhandle(my_entry.fh);
+ dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
res);
return res;
@@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
struct inode *dir;
struct inode *inode;
struct dentry *parent;
+ struct nfs_fh *fhandle = NULL;
+ struct nfs_fattr *fattr = NULL;
int error;
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
parent = dget_parent(dentry);
dir = parent->d_inode;
@@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
if (NFS_STALE(inode))
goto out_bad;
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+ error = -ENOMEM;
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ if (fhandle == NULL || fattr == NULL)
+ goto out_error;
+
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
if (error)
goto out_bad;
- if (nfs_compare_fh(NFS_FH(inode), &fhandle))
+ if (nfs_compare_fh(NFS_FH(inode), fhandle))
goto out_bad;
- if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
+ if ((error = nfs_refresh_inode(inode, fattr)) != 0)
goto out_bad;
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
@@ -842,11 +853,21 @@ out_zap_parent:
shrink_dcache_parent(dentry);
}
d_drop(dentry);
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
dentry->d_name.name);
return 0;
+out_error:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ dput(parent);
+ dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
+ __func__, dentry->d_parent->d_name.name,
+ dentry->d_name.name, error);
+ return error;
}
/*
@@ -911,9 +932,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
struct dentry *res;
struct dentry *parent;
struct inode *inode = NULL;
+ struct nfs_fh *fhandle = NULL;
+ struct nfs_fattr *fattr = NULL;
int error;
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
dfprintk(VFS, "NFS: lookup(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -923,7 +944,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
goto out;
- res = ERR_PTR(-ENOMEM);
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
/*
@@ -936,17 +956,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
goto out;
}
+ res = ERR_PTR(-ENOMEM);
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ if (fhandle == NULL || fattr == NULL)
+ goto out;
+
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+ error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
- inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
+ inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
res = (struct dentry *)inode;
if (IS_ERR(res))
goto out_unblock_sillyrename;
@@ -962,6 +988,8 @@ no_entry:
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
return res;
}
@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
smp_mb__after_atomic_dec();
}
+static void nfs_access_free_list(struct list_head *head)
+{
+ struct nfs_access_entry *cache;
+
+ while (!list_empty(head)) {
+ cache = list_entry(head->next, struct nfs_access_entry, lru);
+ list_del(&cache->lru);
+ nfs_access_free_entry(cache);
+ }
+}
+
int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
{
LIST_HEAD(head);
struct nfs_inode *nfsi;
struct nfs_access_entry *cache;
-restart:
+ if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return (nr_to_scan == 0) ? 0 : -1;
+
spin_lock(&nfs_access_lru_lock);
list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
- struct rw_semaphore *s_umount;
struct inode *inode;
if (nr_to_scan-- == 0)
break;
- s_umount = &nfsi->vfs_inode.i_sb->s_umount;
- if (!down_read_trylock(s_umount))
- continue;
- inode = igrab(&nfsi->vfs_inode);
- if (inode == NULL) {
- up_read(s_umount);
- continue;
- }
+ inode = &nfsi->vfs_inode;
spin_lock(&inode->i_lock);
if (list_empty(&nfsi->access_cache_entry_lru))
goto remove_lru_entry;
@@ -1704,61 +1737,47 @@ restart:
else {
remove_lru_entry:
list_del_init(&nfsi->access_cache_inode_lru);
+ smp_mb__before_clear_bit();
clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+ smp_mb__after_clear_bit();
}
- spin_unlock(&inode->i_lock);
- spin_unlock(&nfs_access_lru_lock);
- iput(inode);
- up_read(s_umount);
- goto restart;
}
spin_unlock(&nfs_access_lru_lock);
- while (!list_empty(&head)) {
- cache = list_entry(head.next, struct nfs_access_entry, lru);
- list_del(&cache->lru);
- nfs_access_free_entry(cache);
- }
+ nfs_access_free_list(&head);
return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
}
-static void __nfs_access_zap_cache(struct inode *inode)
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
{
- struct nfs_inode *nfsi = NFS_I(inode);
struct rb_root *root_node = &nfsi->access_cache;
- struct rb_node *n, *dispose = NULL;
+ struct rb_node *n;
struct nfs_access_entry *entry;
/* Unhook entries from the cache */
while ((n = rb_first(root_node)) != NULL) {
entry = rb_entry(n, struct nfs_access_entry, rb_node);
rb_erase(n, root_node);
- list_del(&entry->lru);
- n->rb_left = dispose;
- dispose = n;
+ list_move(&entry->lru, head);
}
nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
- spin_unlock(&inode->i_lock);
-
- /* Now kill them all! */
- while (dispose != NULL) {
- n = dispose;
- dispose = n->rb_left;
- nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
- }
}
void nfs_access_zap_cache(struct inode *inode)
{
+ LIST_HEAD(head);
+
+ if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+ return;
/* Remove from global LRU init */
- if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
- spin_lock(&nfs_access_lru_lock);
+ spin_lock(&nfs_access_lru_lock);
+ if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
list_del_init(&NFS_I(inode)->access_cache_inode_lru);
- spin_unlock(&nfs_access_lru_lock);
- }
spin_lock(&inode->i_lock);
- /* This will release the spinlock */
- __nfs_access_zap_cache(inode);
+ __nfs_access_zap_cache(NFS_I(inode), &head);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&nfs_access_lru_lock);
+ nfs_access_free_list(&head);
}
static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1809,8 +1828,8 @@ out_stale:
nfs_access_free_entry(cache);
return -ENOENT;
out_zap:
- /* This will release the spinlock */
- __nfs_access_zap_cache(inode);
+ spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
return -ENOENT;
}
@@ -1865,9 +1884,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
smp_mb__after_atomic_inc();
/* Add inode to global LRU list */
- if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
spin_lock(&nfs_access_lru_lock);
- list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
+ if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+ list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+ &nfs_access_lru_list);
spin_unlock(&nfs_access_lru_lock);
}
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8d965bd..cac96bc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_inode *nfsi = NFS_I(inode);
- if (server->flags & NFS_MOUNT_NOAC)
- goto force_reval;
+ if (nfs_have_delegated_attributes(inode))
+ goto out_noreval;
+
if (filp->f_flags & O_DIRECT)
goto force_reval;
- if (nfsi->npages != 0)
- return 0;
- if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode))
- return 0;
+ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+ goto force_reval;
+ if (nfs_attribute_timeout(inode))
+ goto force_reval;
+out_noreval:
+ return 0;
force_reval:
return __nfs_revalidate_inode(server, inode);
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a6b16ed..ce153a6 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
struct list_head *pages,
unsigned *nr_pages)
{
- int ret, npages = *nr_pages;
+ unsigned npages = *nr_pages;
+ int ret;
dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a6..7428f7d 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
{
struct nfs_server *server = NFS_SB(sb);
struct nfs_fsinfo fsinfo;
- struct nfs_fattr fattr;
- struct dentry *mntroot;
+ struct dentry *ret;
struct inode *inode;
int error;
/* get the actual root for this mount */
- fsinfo.fattr = &fattr;
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL)
+ return ERR_PTR(-ENOMEM);
error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
- return ERR_PTR(error);
+ ret = ERR_PTR(error);
+ goto out;
}
inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
- return ERR_CAST(inode);
+ ret = ERR_CAST(inode);
+ goto out;
}
error = nfs_superblock_set_dummy_root(sb, inode);
- if (error != 0)
- return ERR_PTR(error);
+ if (error != 0) {
+ ret = ERR_PTR(error);
+ goto out;
+ }
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
* exists, we'll pick it up at this point and use it as the root
*/
- mntroot = d_obtain_alias(inode);
- if (IS_ERR(mntroot)) {
+ ret = d_obtain_alias(inode);
+ if (IS_ERR(ret)) {
dprintk("nfs_get_root: get root dentry failed\n");
- return mntroot;
+ goto out;
}
- security_d_instantiate(mntroot, inode);
-
- if (!mntroot->d_op)
- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+ security_d_instantiate(ret, inode);
- return mntroot;
+ if (ret->d_op == NULL)
+ ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+out:
+ nfs_free_fattr(fsinfo.fattr);
+ return ret;
}
#ifdef CONFIG_NFS_V4
-/*
- * Do a simple pathwalk from the root FH of the server to the nominated target
- * of the mountpoint
- * - give error on symlinks
- * - give error on ".." occurring in the path
- * - follow traversals
- */
-int nfs4_path_walk(struct nfs_server *server,
- struct nfs_fh *mntfh,
- const char *path)
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
{
struct nfs_fsinfo fsinfo;
- struct nfs_fattr fattr;
- struct nfs_fh lastfh;
- struct qstr name;
- int ret;
+ int ret = -ENOMEM;
- dprintk("--> nfs4_path_walk(,,%s)\n", path);
+ dprintk("--> nfs4_get_rootfh()\n");
- fsinfo.fattr = &fattr;
- nfs_fattr_init(&fattr);
-
- /* Eat leading slashes */
- while (*path == '/')
- path++;
+ fsinfo.fattr = nfs_alloc_fattr();
+ if (fsinfo.fattr == NULL)
+ goto out;
/* Start by getting the root filehandle from the server */
ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
if (ret < 0) {
- dprintk("nfs4_get_root: getroot error = %d\n", -ret);
- return ret;
+ dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+ goto out;
}
- if (!S_ISDIR(fattr.mode)) {
- printk(KERN_ERR "nfs4_get_root:"
+ if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE)
+ || !S_ISDIR(fsinfo.fattr->mode)) {
+ printk(KERN_ERR "nfs4_get_rootfh:"
" getroot encountered non-directory\n");
- return -ENOTDIR;
+ ret = -ENOTDIR;
+ goto out;
}
- /* FIXME: It is quite valid for the server to return a referral here */
- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
- printk(KERN_ERR "nfs4_get_root:"
+ if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+ printk(KERN_ERR "nfs4_get_rootfh:"
" getroot obtained referral\n");
- return -EREMOTE;
- }
-
-next_component:
- dprintk("Next: %s\n", path);
-
- /* extract the next bit of the path */
- if (!*path)
- goto path_walk_complete;
-
- name.name = path;
- while (*path && *path != '/')
- path++;
- name.len = path - (const char *) name.name;
-
- if (name.len > NFS4_MAXNAMLEN)
- return -ENAMETOOLONG;
-
-eat_dot_dir:
- while (*path == '/')
- path++;
-
- if (path[0] == '.' && (path[1] == '/' || !path[1])) {
- path += 2;
- goto eat_dot_dir;
- }
-
- /* FIXME: Why shouldn't the user be able to use ".." in the path? */
- if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
- ) {
- printk(KERN_ERR "nfs4_get_root:"
- " Mount path contains reference to \"..\"\n");
- return -EINVAL;
+ ret = -EREMOTE;
+ goto out;
}
- /* lookup the next FH in the sequence */
- memcpy(&lastfh, mntfh, sizeof(lastfh));
-
- dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path);
-
- ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
- mntfh, &fattr);
- if (ret < 0) {
- dprintk("nfs4_get_root: getroot error = %d\n", -ret);
- return ret;
- }
-
- if (!S_ISDIR(fattr.mode)) {
- printk(KERN_ERR "nfs4_get_root:"
- " lookupfh encountered non-directory\n");
- return -ENOTDIR;
- }
-
- /* FIXME: Referrals are quite valid here too */
- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
- printk(KERN_ERR "nfs4_get_root:"
- " lookupfh obtained referral\n");
- return -EREMOTE;
- }
-
- goto next_component;
-
-path_walk_complete:
- memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
- dprintk("<-- nfs4_path_walk() = 0\n");
- return 0;
+ memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+out:
+ nfs_free_fattr(fsinfo.fattr);
+ dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
+ return ret;
}
/*
@@ -239,8 +174,8 @@ path_walk_complete:
struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
{
struct nfs_server *server = NFS_SB(sb);
- struct nfs_fattr fattr;
- struct dentry *mntroot;
+ struct nfs_fattr *fattr = NULL;
+ struct dentry *ret;
struct inode *inode;
int error;
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
return ERR_PTR(error);
}
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return ERR_PTR(-ENOMEM);;
+
/* get the actual root for this mount */
- error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+ error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
- return ERR_PTR(error);
+ ret = ERR_PTR(error);
+ goto out;
}
- inode = nfs_fhget(sb, mntfh, &fattr);
+ inode = nfs_fhget(sb, mntfh, fattr);
if (IS_ERR(inode)) {
dprintk("nfs_get_root: get root inode failed\n");
- return ERR_CAST(inode);
+ ret = ERR_CAST(inode);
+ goto out;
}
error = nfs_superblock_set_dummy_root(sb, inode);
- if (error != 0)
- return ERR_PTR(error);
+ if (error != 0) {
+ ret = ERR_PTR(error);
+ goto out;
+ }
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
* exists, we'll pick it up at this point and use it as the root
*/
- mntroot = d_obtain_alias(inode);
- if (IS_ERR(mntroot)) {
+ ret = d_obtain_alias(inode);
+ if (IS_ERR(ret)) {
dprintk("nfs_get_root: get root dentry failed\n");
- return mntroot;
+ goto out;
}
- security_d_instantiate(mntroot, inode);
+ security_d_instantiate(ret, inode);
- if (!mntroot->d_op)
- mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+ if (ret->d_op == NULL)
+ ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+out:
+ nfs_free_fattr(fattr);
dprintk("<-- nfs4_get_root()\n");
- return mntroot;
+ return ret;
}
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 50a56ed..099b351 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -393,8 +393,8 @@ int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- struct nfs_fattr fattr;
- int error;
+ struct nfs_fattr *fattr;
+ int error = -ENOMEM;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -417,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
filemap_write_and_wait(inode->i_mapping);
nfs_wb_all(inode);
}
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out;
/*
* Return any delegations if we're going to change ACLs
*/
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
nfs_inode_return_delegation(inode);
- error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
+ error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
if (error == 0)
- nfs_refresh_inode(inode, &fattr);
+ nfs_refresh_inode(inode, fattr);
+ nfs_free_fattr(fattr);
+out:
return error;
}
@@ -682,7 +688,7 @@ int
__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
int status = -ESTALE;
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -693,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out;
+
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
+ status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
inode->i_sb->s_id,
@@ -707,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
}
- status = nfs_refresh_inode(inode, &fattr);
+ status = nfs_refresh_inode(inode, fattr);
if (status) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
@@ -723,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
(long long)NFS_FILEID(inode));
out:
+ nfs_free_fattr(fattr);
return status;
}
@@ -730,9 +742,14 @@ int nfs_attribute_timeout(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+}
+
+static int nfs_attribute_cache_expired(struct inode *inode)
+{
if (nfs_have_delegated_attributes(inode))
return 0;
- return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+ return nfs_attribute_timeout(inode);
}
/**
@@ -745,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode)
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
- && !nfs_attribute_timeout(inode))
+ && !nfs_attribute_cache_expired(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
}
@@ -782,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
int ret = 0;
if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
- || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
+ || nfs_attribute_cache_expired(inode)
+ || NFS_STALE(inode)) {
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (ret < 0)
goto out;
@@ -916,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
fattr->gencount = nfs_inc_attr_generation_counter();
}
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+ struct nfs_fattr *fattr;
+
+ fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+ if (fattr != NULL)
+ nfs_fattr_init(fattr);
+ return fattr;
+}
+
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+ struct nfs_fh *fh;
+
+ fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+ if (fh != NULL)
+ fh->size = 0;
+ return fh;
+}
+
/**
* nfs_inode_attrs_need_update - check if the inode attributes need updating
* @inode - pointer to inode
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f0..d8bd619 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
#ifdef CONFIG_NFS_V4
extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
-extern int nfs4_path_walk(struct nfs_server *server,
- struct nfs_fh *mntfh,
- const char *path);
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
#endif
/* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c8..c583248 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
static inline void nfs_add_server_stats(const struct nfs_server *server,
enum nfs_stat_bytecounters stat,
- unsigned long addend)
+ long addend)
{
this_cpu_add(server->io_stats->bytes[stat], addend);
}
static inline void nfs_add_stats(const struct inode *inode,
enum nfs_stat_bytecounters stat,
- unsigned long addend)
+ long addend)
{
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
#ifdef CONFIG_NFS_FSCACHE
static inline void nfs_add_fscache_stats(struct inode *inode,
enum nfs_stat_fscachecounters stat,
- unsigned long addend)
+ long addend)
{
this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7888cf3..db6aa36 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
struct vfsmount *mnt;
struct nfs_server *server = NFS_SERVER(dentry->d_inode);
struct dentry *parent;
- struct nfs_fh fh;
- struct nfs_fattr fattr;
+ struct nfs_fh *fh = NULL;
+ struct nfs_fattr *fattr = NULL;
int err;
dprintk("--> nfs_follow_mountpoint()\n");
@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
if (IS_ROOT(dentry))
goto out_err;
+ err = -ENOMEM;
+ fh = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ if (fh == NULL || fattr == NULL)
+ goto out_err;
+
dprintk("%s: enter\n", __func__);
dput(nd->path.dentry);
nd->path.dentry = dget(dentry);
@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
parent = dget_parent(nd->path.dentry);
err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
&nd->path.dentry->d_name,
- &fh, &fattr);
+ fh, fattr);
dput(parent);
if (err != 0)
goto out_err;
- if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+ if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
else
- mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh,
- &fattr);
+ mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
+ fattr);
err = PTR_ERR(mnt);
if (IS_ERR(mnt))
goto out_err;
@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
nd->path.dentry = dget(mnt->mnt_root);
schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fh);
dprintk("%s: done, returned %d\n", __func__, err);
dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d150ae0..9f88c5f 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_fattr fattr;
struct page *pages[NFSACL_MAXPAGES] = { };
struct nfs3_getaclargs args = {
.fh = NFS_FH(inode),
@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
.pages = pages,
};
struct nfs3_getaclres res = {
- .fattr = &fattr,
+ 0
};
struct rpc_message msg = {
.rpc_argp = &args,
@@ -228,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
dprintk("NFS call getacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
- nfs_fattr_init(&fattr);
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return ERR_PTR(-ENOMEM);
+
status = rpc_call_sync(server->client_acl, &msg, 0);
dprintk("NFS reply getacl: %d\n", status);
@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
switch (status) {
case 0:
- status = nfs_refresh_inode(inode, &fattr);
+ status = nfs_refresh_inode(inode, res.fattr);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
getout:
posix_acl_release(res.acl_access);
posix_acl_release(res.acl_default);
+ nfs_free_fattr(res.fattr);
if (status != 0) {
posix_acl_release(acl);
@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
struct posix_acl *dfacl)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr;
struct page *pages[NFSACL_MAXPAGES];
struct nfs3_setaclargs args = {
.inode = inode,
@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
}
dprintk("NFS call setacl\n");
+ status = -ENOMEM;
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out_freepages;
+
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
- nfs_fattr_init(&fattr);
+ msg.rpc_resp = fattr;
status = rpc_call_sync(server->client_acl, &msg, 0);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
switch (status) {
case 0:
- status = nfs_refresh_inode(inode, &fattr);
+ status = nfs_refresh_inode(inode, fattr);
nfs3_cache_acls(inode, acl, dfacl);
break;
case -EPFNOSUPPORT:
@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
case -ENOTSUPP:
status = -EOPNOTSUPP;
}
+ nfs_free_fattr(fattr);
out_freepages:
while (args.npages != 0) {
args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index e701002..fabb4f2 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -144,14 +144,12 @@ static int
nfs3_proc_lookup(struct inode *dir, struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
- struct nfs_fattr dir_attr;
struct nfs3_diropargs arg = {
.fh = NFS_FH(dir),
.name = name->name,
.len = name->len
};
struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
.fh = fhandle,
.fattr = fattr
};
@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
int status;
dprintk("NFS call lookup %s\n", name->name);
- nfs_fattr_init(&dir_attr);
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ return -ENOMEM;
+
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_refresh_inode(dir, res.dir_attr);
if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
msg.rpc_argp = fhandle;
msg.rpc_resp = fattr;
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
+ nfs_free_fattr(res.dir_attr);
dprintk("NFS reply lookup: %d\n", status);
return status;
}
static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
{
- struct nfs_fattr fattr;
struct nfs3_accessargs arg = {
.fh = NFS_FH(inode),
};
- struct nfs3_accessres res = {
- .fattr = &fattr,
- };
+ struct nfs3_accessres res;
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
.rpc_argp = &arg,
@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
.rpc_cred = entry->cred,
};
int mode = entry->mask;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call access\n");
@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
if (mode & MAY_EXEC)
arg.access |= NFS3_ACCESS_EXECUTE;
}
- nfs_fattr_init(&fattr);
+
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ goto out;
+
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_refresh_inode(inode, &fattr);
+ nfs_refresh_inode(inode, res.fattr);
if (status == 0) {
entry->mask = 0;
if (res.access & NFS3_ACCESS_READ)
@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
entry->mask |= MAY_EXEC;
}
+ nfs_free_fattr(res.fattr);
+out:
dprintk("NFS reply access: %d\n", status);
return status;
}
@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
static int nfs3_proc_readlink(struct inode *inode, struct page *page,
unsigned int pgbase, unsigned int pglen)
{
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr;
struct nfs3_readlinkargs args = {
.fh = NFS_FH(inode),
.pgbase = pgbase,
@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
.rpc_argp = &args,
- .rpc_resp = &fattr,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call readlink\n");
- nfs_fattr_init(&fattr);
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ goto out;
+ msg.rpc_resp = fattr;
+
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_refresh_inode(inode, &fattr);
+ nfs_refresh_inode(inode, fattr);
+ nfs_free_fattr(fattr);
+out:
dprintk("NFS reply readlink: %d\n", status);
return status;
}
@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
.rpc_argp = &arg,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call remove %s\n", name->name);
- nfs_fattr_init(&res.dir_attr);
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ goto out;
+
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &res.dir_attr);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ nfs_free_fattr(res.dir_attr);
+out:
dprintk("NFS reply remove: %d\n", status);
return status;
}
@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (nfs3_async_handle_jukebox(task, dir))
return 0;
res = task->tk_msg.rpc_resp;
- nfs_post_op_update_inode(dir, &res->dir_attr);
+ nfs_post_op_update_inode(dir, res->dir_attr);
return 1;
}
@@ -427,7 +442,6 @@ static int
nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
struct inode *new_dir, struct qstr *new_name)
{
- struct nfs_fattr old_dir_attr, new_dir_attr;
struct nfs3_renameargs arg = {
.fromfh = NFS_FH(old_dir),
.fromname = old_name->name,
@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
.toname = new_name->name,
.tolen = new_name->len
};
- struct nfs3_renameres res = {
- .fromattr = &old_dir_attr,
- .toattr = &new_dir_attr
- };
+ struct nfs3_renameres res;
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
.rpc_argp = &arg,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
- nfs_fattr_init(&old_dir_attr);
- nfs_fattr_init(&new_dir_attr);
+
+ res.fromattr = nfs_alloc_fattr();
+ res.toattr = nfs_alloc_fattr();
+ if (res.fromattr == NULL || res.toattr == NULL)
+ goto out;
+
status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
- nfs_post_op_update_inode(old_dir, &old_dir_attr);
- nfs_post_op_update_inode(new_dir, &new_dir_attr);
+ nfs_post_op_update_inode(old_dir, res.fromattr);
+ nfs_post_op_update_inode(new_dir, res.toattr);
+out:
+ nfs_free_fattr(res.toattr);
+ nfs_free_fattr(res.fromattr);
dprintk("NFS reply rename: %d\n", status);
return status;
}
@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
static int
nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
- struct nfs_fattr dir_attr, fattr;
struct nfs3_linkargs arg = {
.fromfh = NFS_FH(inode),
.tofh = NFS_FH(dir),
.toname = name->name,
.tolen = name->len
};
- struct nfs3_linkres res = {
- .dir_attr = &dir_attr,
- .fattr = &fattr
- };
+ struct nfs3_linkres res;
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
.rpc_argp = &arg,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call link %s\n", name->name);
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
+ res.fattr = nfs_alloc_fattr();
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.fattr == NULL || res.dir_attr == NULL)
+ goto out;
+
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- nfs_post_op_update_inode(inode, &fattr);
+ nfs_post_op_update_inode(dir, res.dir_attr);
+ nfs_post_op_update_inode(inode, res.fattr);
+out:
+ nfs_free_fattr(res.dir_attr);
+ nfs_free_fattr(res.fattr);
dprintk("NFS reply link: %d\n", status);
return status;
}
@@ -554,7 +574,7 @@ out:
static int
nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
{
- struct nfs_fattr dir_attr;
+ struct nfs_fattr *dir_attr;
struct nfs3_diropargs arg = {
.fh = NFS_FH(dir),
.name = name->name,
@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
struct rpc_message msg = {
.rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
.rpc_argp = &arg,
- .rpc_resp = &dir_attr,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call rmdir %s\n", name->name);
- nfs_fattr_init(&dir_attr);
+ dir_attr = nfs_alloc_fattr();
+ if (dir_attr == NULL)
+ goto out;
+
+ msg.rpc_resp = dir_attr;
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
+ nfs_post_op_update_inode(dir, dir_attr);
+ nfs_free_fattr(dir_attr);
+out:
dprintk("NFS reply rmdir: %d\n", status);
return status;
}
@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
u64 cookie, struct page *page, unsigned int count, int plus)
{
struct inode *dir = dentry->d_inode;
- struct nfs_fattr dir_attr;
__be32 *verf = NFS_COOKIEVERF(dir);
struct nfs3_readdirargs arg = {
.fh = NFS_FH(dir),
@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.pages = &page
};
struct nfs3_readdirres res = {
- .dir_attr = &dir_attr,
.verf = verf,
.plus = plus
};
@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.rpc_resp = &res,
.rpc_cred = cred
};
- int status;
+ int status = -ENOMEM;
if (plus)
msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
dprintk("NFS call readdir%s %d\n",
plus? "plus" : "", (unsigned int) cookie);
- nfs_fattr_init(&dir_attr);
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ goto out;
+
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_invalidate_atime(dir);
+ nfs_refresh_inode(dir, res.dir_attr);
- nfs_refresh_inode(dir, &dir_attr);
+ nfs_free_fattr(res.dir_attr);
+out:
dprintk("NFS reply readdir: %d\n", status);
return status;
}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 56a86f6..75dcfc7 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
static int
nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
{
- return nfs3_xdr_wccstat(req, p, &res->dir_attr);
+ return nfs3_xdr_wccstat(req, p, res->dir_attr);
}
/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200..c538c61 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
/* nfs4proc.c */
-extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
-extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f071d12..3c2a172 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -115,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
char *page, char *page2,
const struct nfs4_fs_location *location)
{
+ const size_t addr_bufsize = sizeof(struct sockaddr_storage);
struct vfsmount *mnt = ERR_PTR(-ENOENT);
char *mnt_path;
unsigned int maxbuflen;
@@ -126,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
mountdata->mnt_path = mnt_path;
maxbuflen = mnt_path - 1 - page2;
+ mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
+ if (mountdata->addr == NULL)
+ return ERR_PTR(-ENOMEM);
+
for (s = 0; s < location->nservers; s++) {
const struct nfs4_string *buf = &location->servers[s];
- struct sockaddr_storage addr;
if (buf->len <= 0 || buf->len >= maxbuflen)
continue;
@@ -137,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
continue;
mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
- (struct sockaddr *)&addr, sizeof(addr));
+ mountdata->addr, addr_bufsize);
if (mountdata->addrlen == 0)
continue;
- mountdata->addr = (struct sockaddr *)&addr;
rpc_set_port(mountdata->addr, NFS_PORT);
memcpy(page2, buf->data, buf->len);
@@ -156,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
if (!IS_ERR(mnt))
break;
}
+ kfree(mountdata->addr);
return mnt;
}
@@ -221,8 +225,8 @@ out:
/*
* nfs_do_refmount - handle crossing a referral on server
+ * @mnt_parent - mountpoint of referral
* @dentry - dentry of referral
- * @nd - nameidata info
*
*/
struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 071fced..70015dd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state);
/* Prevent leaks of NFSv4 errors into userland */
static int nfs4_map_errors(int err)
@@ -714,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
struct nfs4_state_owner *sp, fmode_t fmode, int flags,
- const struct iattr *attrs)
+ const struct iattr *attrs,
+ gfp_t gfp_mask)
{
struct dentry *parent = dget_parent(path->dentry);
struct inode *dir = parent->d_inode;
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_opendata *p;
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
goto err;
- p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+ p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
if (p->o_arg.seqid == NULL)
goto err_free;
path_get(path);
@@ -1060,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
{
struct nfs4_opendata *opendata;
- opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
+ opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -1648,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
if (path->dentry->d_inode != NULL)
nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
status = -ENOMEM;
- opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
+ opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
if (opendata == NULL)
goto err_put_state_owner;
@@ -1659,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
if (status != 0)
goto err_opendata_put;
- if (opendata->o_arg.open_flags & O_EXCL)
- nfs4_exclusive_attrset(opendata, sattr);
-
state = nfs4_opendata_to_nfs4_state(opendata);
status = PTR_ERR(state);
if (IS_ERR(state))
goto err_opendata_put;
if (server->caps & NFS_CAP_POSIX_LOCK)
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+
+ if (opendata->o_arg.open_flags & O_EXCL) {
+ nfs4_exclusive_attrset(opendata, sattr);
+
+ nfs_fattr_init(opendata->o_res.f_attr);
+ status = nfs4_do_setattr(state->inode, cred,
+ opendata->o_res.f_attr, sattr,
+ state);
+ if (status == 0)
+ nfs_setattr_update_inode(state->inode, sattr);
+ nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ }
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
*res = state;
@@ -1914,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
*
* NOTE: Caller must be holding the sp->so_owner semaphore!
*/
-int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_closedata *calldata;
@@ -1933,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
};
int status = -ENOMEM;
- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+ calldata = kzalloc(sizeof(*calldata), gfp_mask);
if (calldata == NULL)
goto out;
calldata->inode = state->inode;
@@ -1941,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
calldata->arg.fh = NFS_FH(state->inode);
calldata->arg.stateid = &state->open_stateid;
/* Serialization for the sequence id */
- calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
+ calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
if (calldata->arg.seqid == NULL)
goto out_free_calldata;
calldata->arg.fmode = 0;
@@ -2404,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_fattr fattr;
struct nfs4_accessargs args = {
.fh = NFS_FH(inode),
.bitmask = server->attr_bitmask,
};
struct nfs4_accessres res = {
.server = server,
- .fattr = &fattr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2438,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
if (mode & MAY_EXEC)
args.access |= NFS4_ACCESS_EXECUTE;
}
- nfs_fattr_init(&fattr);
+
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ return -ENOMEM;
+
status = nfs4_call_sync(server, &msg, &args, &res, 0);
if (!status) {
entry->mask = 0;
@@ -2448,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
entry->mask |= MAY_WRITE;
if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
entry->mask |= MAY_EXEC;
- nfs_refresh_inode(inode, &fattr);
+ nfs_refresh_inode(inode, res.fattr);
}
+ nfs_free_fattr(res.fattr);
return status;
}
@@ -2562,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
}
d_add(dentry, igrab(state->inode));
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- if (flags & O_EXCL) {
- struct nfs_fattr fattr;
- status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
- if (status == 0)
- nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, &fattr);
- }
if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
status = nfs4_intent_set_file(nd, &path, state, fmode);
else
@@ -2596,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
.rpc_argp = &args,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
+
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.dir_attr == NULL)
+ goto out;
- nfs_fattr_init(&res.dir_attr);
status = nfs4_call_sync(server, &msg, &args, &res, 1);
if (status == 0) {
update_changeattr(dir, &res.cinfo);
- nfs_post_op_update_inode(dir, &res.dir_attr);
+ nfs_post_op_update_inode(dir, res.dir_attr);
}
+ nfs_free_fattr(res.dir_attr);
+out:
return status;
}
@@ -2638,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
- nfs_post_op_update_inode(dir, &res->dir_attr);
+ nfs_post_op_update_inode(dir, res->dir_attr);
return 1;
}
@@ -2653,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
.new_name = new_name,
.bitmask = server->attr_bitmask,
};
- struct nfs_fattr old_fattr, new_fattr;
struct nfs4_rename_res res = {
.server = server,
- .old_fattr = &old_fattr,
- .new_fattr = &new_fattr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
.rpc_argp = &arg,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
- nfs_fattr_init(res.old_fattr);
- nfs_fattr_init(res.new_fattr);
- status = nfs4_call_sync(server, &msg, &arg, &res, 1);
+ res.old_fattr = nfs_alloc_fattr();
+ res.new_fattr = nfs_alloc_fattr();
+ if (res.old_fattr == NULL || res.new_fattr == NULL)
+ goto out;
+ status = nfs4_call_sync(server, &msg, &arg, &res, 1);
if (!status) {
update_changeattr(old_dir, &res.old_cinfo);
nfs_post_op_update_inode(old_dir, res.old_fattr);
update_changeattr(new_dir, &res.new_cinfo);
nfs_post_op_update_inode(new_dir, res.new_fattr);
}
+out:
+ nfs_free_fattr(res.new_fattr);
+ nfs_free_fattr(res.old_fattr);
return status;
}
@@ -2702,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
.name = name,
.bitmask = server->attr_bitmask,
};
- struct nfs_fattr fattr, dir_attr;
struct nfs4_link_res res = {
.server = server,
- .fattr = &fattr,
- .dir_attr = &dir_attr,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
.rpc_argp = &arg,
.rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
+
+ res.fattr = nfs_alloc_fattr();
+ res.dir_attr = nfs_alloc_fattr();
+ if (res.fattr == NULL || res.dir_attr == NULL)
+ goto out;
- nfs_fattr_init(res.fattr);
- nfs_fattr_init(res.dir_attr);
status = nfs4_call_sync(server, &msg, &arg, &res, 1);
if (!status) {
update_changeattr(dir, &res.cinfo);
nfs_post_op_update_inode(dir, res.dir_attr);
nfs_post_op_update_inode(inode, res.fattr);
}
-
+out:
+ nfs_free_fattr(res.dir_attr);
+ nfs_free_fattr(res.fattr);
return status;
}
@@ -3146,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
}
+struct nfs4_renewdata {
+ struct nfs_client *client;
+ unsigned long timestamp;
+};
+
/*
* nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
* standalone procedure for queueing an asynchronous RENEW.
*/
-static void nfs4_renew_release(void *data)
+static void nfs4_renew_release(void *calldata)
{
- struct nfs_client *clp = data;
+ struct nfs4_renewdata *data = calldata;
+ struct nfs_client *clp = data->client;
if (atomic_read(&clp->cl_count) > 1)
nfs4_schedule_state_renewal(clp);
nfs_put_client(clp);
+ kfree(data);
}
-static void nfs4_renew_done(struct rpc_task *task, void *data)
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
{
- struct nfs_client *clp = data;
- unsigned long timestamp = task->tk_start;
+ struct nfs4_renewdata *data = calldata;
+ struct nfs_client *clp = data->client;
+ unsigned long timestamp = data->timestamp;
if (task->tk_status < 0) {
/* Unless we're shutting down, schedule state recovery! */
@@ -3188,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
.rpc_argp = clp,
.rpc_cred = cred,
};
+ struct nfs4_renewdata *data;
if (!atomic_inc_not_zero(&clp->cl_count))
return -EIO;
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+ data->client = clp;
+ data->timestamp = jiffies;
return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
- &nfs4_renew_ops, clp);
+ &nfs4_renew_ops, data);
}
int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3494,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
return _nfs4_async_handle_error(task, server, server->nfs_client, state);
}
-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+ unsigned short port, struct rpc_cred *cred,
+ struct nfs4_setclientid_res *res)
{
nfs4_verifier sc_verifier;
struct nfs4_setclientid setclientid = {
@@ -3504,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
.rpc_argp = &setclientid,
- .rpc_resp = clp,
+ .rpc_resp = res,
.rpc_cred = cred,
};
__be32 *p;
@@ -3547,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
return status;
}
-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+ struct nfs4_setclientid_res *arg,
+ struct rpc_cred *cred)
{
struct nfs_fsinfo fsinfo;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
- .rpc_argp = clp,
+ .rpc_argp = arg,
.rpc_resp = &fsinfo,
.rpc_cred = cred,
};
@@ -3570,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
return status;
}
-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+ struct nfs4_setclientid_res *arg,
+ struct rpc_cred *cred)
{
long timeout = 0;
int err;
do {
- err = _nfs4_proc_setclientid_confirm(clp, cred);
+ err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
switch (err) {
case 0:
return err;
@@ -3667,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
};
int status = 0;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
data->args.fhandle = &data->fh;
@@ -3823,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_unlockdata *p;
struct inode *inode = lsp->ls_state->inode;
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kzalloc(sizeof(*p), GFP_NOFS);
if (p == NULL)
return NULL;
p->arg.fh = NFS_FH(inode);
@@ -3961,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
if (test_bit(NFS_DELEGATED_STATE, &state->flags))
goto out;
lsp = request->fl_u.nfs4_fl.owner;
- seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+ seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
status = -ENOMEM;
if (seqid == NULL)
goto out;
@@ -3989,22 +4027,23 @@ struct nfs4_lockdata {
};
static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
- struct nfs_open_context *ctx, struct nfs4_lock_state *lsp)
+ struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+ gfp_t gfp_mask)
{
struct nfs4_lockdata *p;
struct inode *inode = lsp->ls_state->inode;
struct nfs_server *server = NFS_SERVER(inode);
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
return NULL;
p->arg.fh = NFS_FH(inode);
p->arg.fl = &p->fl;
- p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+ p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
if (p->arg.open_seqid == NULL)
goto out_free;
- p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+ p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
if (p->arg.lock_seqid == NULL)
goto out_free_seqid;
p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4158,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
dprintk("%s: begin!\n", __func__);
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
- fl->fl_u.nfs4_fl.owner);
+ fl->fl_u.nfs4_fl.owner,
+ recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
if (data == NULL)
return -ENOMEM;
if (IS_SETLKW(cmd))
@@ -4647,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
if (max_reqs != tbl->max_slots) {
ret = -ENOMEM;
new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
- GFP_KERNEL);
+ GFP_NOFS);
if (!new)
goto out;
ret = 0;
@@ -4712,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+ slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
if (!slot)
goto out;
ret = 0;
@@ -4761,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
- session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+ session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
if (!session)
return NULL;
@@ -5105,8 +5145,8 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
if (!atomic_inc_not_zero(&clp->cl_count))
return -EIO;
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- res = kzalloc(sizeof(*res), GFP_KERNEL);
+ args = kzalloc(sizeof(*args), GFP_NOFS);
+ res = kzalloc(sizeof(*res), GFP_NOFS);
if (!args || !res) {
kfree(args);
kfree(res);
@@ -5207,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
int status = -ENOMEM;
dprintk("--> %s\n", __func__);
- calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+ calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
if (calldata == NULL)
goto out;
calldata->clp = clp;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51..34acf59 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
{
+ struct nfs4_setclientid_res clid;
unsigned short port;
int status;
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
if (clp->cl_addr.ss_family == AF_INET6)
port = nfs_callback_tcpport6;
- status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
- if (status == 0)
- status = nfs4_proc_setclientid_confirm(clp, cred);
- if (status == 0)
- nfs4_schedule_state_renewal(clp);
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+ if (status != 0)
+ goto out;
+ status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+ if (status != 0)
+ goto out;
+ clp->cl_clientid = clid.clientid;
+ nfs4_schedule_state_renewal(clp);
+out:
return status;
}
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
{
struct nfs4_state_owner *sp;
- sp = kzalloc(sizeof(*sp),GFP_KERNEL);
+ sp = kzalloc(sizeof(*sp),GFP_NOFS);
if (!sp)
return NULL;
spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
{
struct nfs4_state *state;
- state = kzalloc(sizeof(*state), GFP_KERNEL);
+ state = kzalloc(sizeof(*state), GFP_NOFS);
if (!state)
return NULL;
atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
/*
* Close the current file.
*/
-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state,
+ fmode_t fmode, gfp_t gfp_mask, int wait)
{
struct nfs4_state_owner *owner = state->owner;
int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
nfs4_put_open_state(state);
nfs4_put_state_owner(owner);
} else
- nfs4_do_close(path, state, wait);
+ nfs4_do_close(path, state, gfp_mask, wait);
}
void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(path, state, fmode, 0);
+ __nfs4_close(path, state, fmode, GFP_NOFS, 0);
}
void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(path, state, fmode, 1);
+ __nfs4_close(path, state, fmode, GFP_KERNEL, 1);
}
/*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
struct nfs4_lock_state *lsp;
struct nfs_client *clp = state->owner->so_client;
- lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
+ lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
if (lsp == NULL)
return NULL;
rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
nfs4_put_lock_state(lsp);
}
-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
{
struct nfs_seqid *new;
- new = kmalloc(sizeof(*new), GFP_KERNEL);
+ new = kmalloc(sizeof(*new), gfp_mask);
if (new != NULL) {
new->sequence = counter;
INIT_LIST_HEAD(&new->list);
@@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
nfs4_begin_drain_session(clp);
new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
- GFP_KERNEL);
+ GFP_NOFS);
if (!new)
return -ENOMEM;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 38f3b58..6bdef28 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1504,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
hdr->replen += decode_setclientid_maxsz;
}
-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
{
__be32 *p;
p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
- p = xdr_encode_hyper(p, client_state->cl_clientid);
- xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ p = xdr_encode_hyper(p, arg->clientid);
+ xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
hdr->nops++;
hdr->replen += decode_setclientid_confirm_maxsz;
}
@@ -2324,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
/*
* a SETCLIENTID_CONFIRM request
*/
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -2334,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, req, &hdr);
- encode_setclientid_confirm(&xdr, clp, &hdr);
+ encode_setclientid_confirm(&xdr, arg, &hdr);
encode_putrootfh(&xdr, &hdr);
encode_fsinfo(&xdr, lease_bitmap, &hdr);
encode_nops(&hdr);
@@ -4397,7 +4397,7 @@ out_overflow:
return -EIO;
}
-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
{
__be32 *p;
uint32_t opnum;
@@ -4417,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
if (unlikely(!p))
goto out_overflow;
- p = xdr_decode_hyper(p, &clp->cl_clientid);
- memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
+ p = xdr_decode_hyper(p, &res->clientid);
+ memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
uint32_t len;
@@ -4815,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
goto out;
if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
goto out;
- decode_getfattr(&xdr, &res->dir_attr, res->server,
+ decode_getfattr(&xdr, res->dir_attr, res->server,
!RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
@@ -5498,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
* Decode SETCLIENTID response
*/
static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
- struct nfs_client *clp)
+ struct nfs4_setclientid_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -5507,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
status = decode_compound_hdr(&xdr, &hdr);
if (!status)
- status = decode_setclientid(&xdr, clp);
+ status = decode_setclientid(&xdr, res);
return status;
}
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27..6bd19d8 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
*/
static int __init root_nfs_get_handle(void)
{
- struct nfs_fh fh;
struct sockaddr_in sin;
unsigned int auth_flav_len = 0;
struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
NFS_MNT3_VERSION : NFS_MNT_VERSION,
.protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
- .fh = &fh,
.auth_flav_len = &auth_flav_len,
};
- int status;
+ int status = -ENOMEM;
+ request.fh = nfs_alloc_fhandle();
+ if (!request.fh)
+ goto out;
set_sockaddr(&sin, servaddr, htons(mount_port));
status = nfs_mount(&request);
if (status < 0)
printk(KERN_ERR "Root-NFS: Server returned error %d "
"while mounting %s\n", status, nfs_export_path);
else {
- nfs_data.root.size = fh.size;
- memcpy(nfs_data.root.data, fh.data, fh.size);
+ nfs_data.root.size = request.fh->size;
+ memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
}
-
+ nfs_free_fhandle(request.fh);
+out:
return status;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29d9d36..a3654e5 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
{
struct nfs_page *req;
- for (;;) {
- /* try to allocate the request struct */
- req = nfs_page_alloc();
- if (req != NULL)
- break;
-
- if (fatal_signal_pending(current))
- return ERR_PTR(-ERESTARTSYS);
- yield();
- }
+ /* try to allocate the request struct */
+ req = nfs_page_alloc();
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0288be8..611bec2 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
return status;
}
+struct nfs_createdata {
+ struct nfs_createargs arg;
+ struct nfs_diropok res;
+ struct nfs_fh fhandle;
+ struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+ struct dentry *dentry, struct iattr *sattr)
+{
+ struct nfs_createdata *data;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+ if (data != NULL) {
+ data->arg.fh = NFS_FH(dir);
+ data->arg.name = dentry->d_name.name;
+ data->arg.len = dentry->d_name.len;
+ data->arg.sattr = sattr;
+ nfs_fattr_init(&data->fattr);
+ data->fhandle.size = 0;
+ data->res.fh = &data->fhandle;
+ data->res.fattr = &data->fattr;
+ }
+ return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+ kfree(data);
+}
+
static int
nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags, struct nameidata *nd)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs_diropok res = {
- .fh = &fhandle,
- .fattr = &fattr
- };
+ struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
- nfs_fattr_init(&fattr);
dprintk("NFS call create %s\n", dentry->d_name.name);
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
dprintk("NFS reply create: %d\n", status);
return status;
}
@@ -264,24 +289,12 @@ static int
nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dev_t rdev)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs_diropok res = {
- .fh = &fhandle,
- .fattr = &fattr
- };
+ struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
};
- int status, mode;
+ umode_t mode;
+ int status = -ENOMEM;
dprintk("NFS call mknod %s\n", dentry->d_name.name);
@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
}
- nfs_fattr_init(&fattr);
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == -EINVAL && S_ISFIFO(mode)) {
sattr->ia_mode = mode;
- nfs_fattr_init(&fattr);
+ nfs_fattr_init(data->res.fattr);
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
}
if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
dprintk("NFS reply mknod: %d\n", status);
return status;
}
@@ -398,8 +418,8 @@ static int
nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
+ struct nfs_fh *fh;
+ struct nfs_fattr *fattr;
struct nfs_symlinkargs arg = {
.fromfh = NFS_FH(dir),
.fromname = dentry->d_name.name,
@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
.rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
.rpc_argp = &arg,
};
- int status;
+ int status = -ENAMETOOLONG;
+
+ dprintk("NFS call symlink %s\n", dentry->d_name.name);
if (len > NFS2_MAXPATHLEN)
- return -ENAMETOOLONG;
+ goto out;
- dprintk("NFS call symlink %s\n", dentry->d_name.name);
+ fh = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ status = -ENOMEM;
+ if (fh == NULL || fattr == NULL)
+ goto out;
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
* filehandle size to zero indicates to nfs_instantiate that it
* should fill in the data with a LOOKUP call on the wire.
*/
- if (status == 0) {
- nfs_fattr_init(&fattr);
- fhandle.size = 0;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ if (status == 0)
+ status = nfs_instantiate(dentry, fh, fattr);
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fh);
+out:
dprintk("NFS reply symlink: %d\n", status);
return status;
}
@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
static int
nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs_diropok res = {
- .fh = &fhandle,
- .fattr = &fattr
- };
+ struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
};
- int status;
+ int status = -ENOMEM;
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
- nfs_fattr_init(&fattr);
+ data = nfs_alloc_createdata(dir, dentry, sattr);
+ if (data == NULL)
+ goto out;
+ msg.rpc_argp = &data->arg;
+ msg.rpc_resp = &data->res;
+
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ nfs_free_createdata(data);
+out:
dprintk("NFS reply mkdir: %d\n", status);
return status;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360..6e2b06e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
- struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+ struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
if (p) {
memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
if (pagecount <= ARRAY_SIZE(p->page_array))
p->pagevec = p->page_array;
else {
- p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+ p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
if (!p->pagevec) {
mempool_free(p, nfs_rdata_mempool);
p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b4148fc..2f8b115 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_resvport, "resvport" },
{ Opt_noresvport, "noresvport" },
{ Opt_fscache, "fsc" },
- { Opt_fscache_uniq, "fsc=%s" },
{ Opt_nofscache, "nofsc" },
{ Opt_port, "port=%s" },
@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mountaddr, "mountaddr=%s" },
{ Opt_lookupcache, "lookupcache=%s" },
+ { Opt_fscache_uniq, "fsc=%s" },
{ Opt_err, NULL }
};
@@ -423,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
unsigned char blockbits;
unsigned long blockres;
struct nfs_fh *fh = NFS_FH(dentry->d_inode);
- struct nfs_fattr fattr;
- struct nfs_fsstat res = {
- .fattr = &fattr,
- };
- int error;
+ struct nfs_fsstat res;
+ int error = -ENOMEM;
+
+ res.fattr = nfs_alloc_fattr();
+ if (res.fattr == NULL)
+ goto out_err;
error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+ nfs_free_fattr(res.fattr);
if (error < 0)
goto out_err;
+
buf->f_type = NFS_SUPER_MAGIC;
/*
@@ -1046,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw,
kfree(mnt->fscache_uniq);
mnt->fscache_uniq = NULL;
break;
- case Opt_fscache_uniq:
- string = match_strdup(args);
- if (!string)
- goto out_nomem;
- kfree(mnt->fscache_uniq);
- mnt->fscache_uniq = string;
- mnt->options |= NFS_OPTION_FSCACHE;
- break;
/*
* options that take numeric values
@@ -1384,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw,
return 0;
};
break;
+ case Opt_fscache_uniq:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ kfree(mnt->fscache_uniq);
+ mnt->fscache_uniq = string;
+ mnt->options |= NFS_OPTION_FSCACHE;
+ break;
/*
* Special options
@@ -2172,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
int error = -ENOMEM;
data = nfs_alloc_parsed_mount_data(3);
- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
goto out_free_fh;
@@ -2247,7 +2251,7 @@ out:
kfree(data->fscache_uniq);
security_free_mnt_opts(&data->lsm_opts);
out_free_fh:
- kfree(mntfh);
+ nfs_free_fhandle(mntfh);
kfree(data);
return error;
@@ -2556,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
};
int error = -ENOMEM;
- mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ mntfh = nfs_alloc_fhandle();
if (data == NULL || mntfh == NULL)
goto out_free_fh;
@@ -2614,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
out:
security_free_mnt_opts(&data->lsm_opts);
out_free_fh:
- kfree(mntfh);
+ nfs_free_fhandle(mntfh);
return error;
out_free:
@@ -2669,41 +2673,120 @@ out_freepage:
free_page((unsigned long)page);
}
+struct nfs_referral_count {
+ struct list_head list;
+ const struct task_struct *task;
+ unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+ struct nfs_referral_count *p;
+
+ list_for_each_entry(p, &nfs_referral_count_list, list) {
+ if (p->task == current)
+ return p;
+ }
+ return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+ struct nfs_referral_count *p, *new;
+ int ret = -ENOMEM;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ goto out;
+ new->task = current;
+ new->referral_count = 1;
+
+ ret = 0;
+ spin_lock(&nfs_referral_count_list_lock);
+ p = nfs_find_referral_count();
+ if (p != NULL) {
+ if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+ ret = -ELOOP;
+ else
+ p->referral_count++;
+ } else {
+ list_add(&new->list, &nfs_referral_count_list);
+ new = NULL;
+ }
+ spin_unlock(&nfs_referral_count_list_lock);
+ kfree(new);
+out:
+ return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+ struct nfs_referral_count *p;
+
+ spin_lock(&nfs_referral_count_list_lock);
+ p = nfs_find_referral_count();
+ p->referral_count--;
+ if (p->referral_count == 0)
+ list_del(&p->list);
+ else
+ p = NULL;
+ spin_unlock(&nfs_referral_count_list_lock);
+ kfree(p);
+}
+
static int nfs_follow_remote_path(struct vfsmount *root_mnt,
const char *export_path, struct vfsmount *mnt_target)
{
+ struct nameidata *nd = NULL;
struct mnt_namespace *ns_private;
- struct nameidata nd;
struct super_block *s;
int ret;
+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (nd == NULL)
+ return -ENOMEM;
+
ns_private = create_mnt_ns(root_mnt);
ret = PTR_ERR(ns_private);
if (IS_ERR(ns_private))
goto out_mntput;
+ ret = nfs_referral_loop_protect();
+ if (ret != 0)
+ goto out_put_mnt_ns;
+
ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
- export_path, LOOKUP_FOLLOW, &nd);
+ export_path, LOOKUP_FOLLOW, nd);
+ nfs_referral_loop_unprotect();
put_mnt_ns(ns_private);
if (ret != 0)
goto out_err;
- s = nd.path.mnt->mnt_sb;
+ s = nd->path.mnt->mnt_sb;
atomic_inc(&s->s_active);
mnt_target->mnt_sb = s;
- mnt_target->mnt_root = dget(nd.path.dentry);
+ mnt_target->mnt_root = dget(nd->path.dentry);
/* Correct the device pathname */
- nfs_fix_devname(&nd.path, mnt_target);
+ nfs_fix_devname(&nd->path, mnt_target);
- path_put(&nd.path);
+ path_put(&nd->path);
+ kfree(nd);
down_write(&s->s_umount);
return 0;
+out_put_mnt_ns:
+ put_mnt_ns(ns_private);
out_mntput:
mntput(root_mnt);
out_err:
+ kfree(nd);
return ret;
}
@@ -2874,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
struct super_block *s;
struct nfs_server *server;
struct dentry *mntroot;
- struct nfs_fh mntfh;
+ struct nfs_fh *mntfh;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error;
+ int error = -ENOMEM;
dprintk("--> nfs4_referral_get_sb()\n");
+ mntfh = nfs_alloc_fhandle();
+ if (mntfh == NULL)
+ goto out_err_nofh;
+
/* create a new volume representation */
- server = nfs4_create_referral_server(data, &mntfh);
+ server = nfs4_create_referral_server(data, mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out_err_noserver;
@@ -2916,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
nfs_fscache_get_super_cookie(s, NULL, data);
}
- mntroot = nfs4_get_root(s, &mntfh);
+ mntroot = nfs4_get_root(s, mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
@@ -2933,12 +3020,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
security_sb_clone_mnt_opts(data->sb, s);
+ nfs_free_fhandle(mntfh);
dprintk("<-- nfs4_referral_get_sb() = 0\n");
return 0;
out_err_nosb:
nfs_free_server(server);
out_err_noserver:
+ nfs_free_fhandle(mntfh);
+out_err_nofh:
dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
return error;
@@ -2947,6 +3037,7 @@ error_splat_super:
bdi_unregister(&server->backing_dev_info);
error_splat_bdi:
deactivate_locked_super(s);
+ nfs_free_fhandle(mntfh);
dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
return error;
}
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3f..a2242af 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
struct nfs_removeres res;
struct inode *dir;
struct rpc_cred *cred;
+ struct nfs_fattr dir_attr;
};
/**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
}
nfs_sb_active(dir->i_sb);
data->args.fh = NFS_FH(dir);
- nfs_fattr_init(&data->res.dir_attr);
+ nfs_fattr_init(data->res.dir_attr);
NFS_PROTO(dir)->unlink_setup(&msg, dir);
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
goto out_free;
}
data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+ data->res.dir_attr = &data->dir_attr;
status = -EBUSY;
spin_lock(&dentry->d_lock);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 872a5ef..c2a4f71 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
.alloc = expkey_alloc,
};
-static struct svc_expkey *
-svc_expkey_lookup(struct svc_expkey *item)
+static int
+svc_expkey_hash(struct svc_expkey *item)
{
- struct cache_head *ch;
int hash = item->ek_fsidtype;
char * cp = (char*)item->ek_fsid;
int len = key_len(item->ek_fsidtype);
@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
hash &= EXPKEY_HASHMASK;
+ return hash;
+}
+
+static struct svc_expkey *
+svc_expkey_lookup(struct svc_expkey *item)
+{
+ struct cache_head *ch;
+ int hash = svc_expkey_hash(item);
ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
hash);
@@ -283,13 +290,7 @@ static struct svc_expkey *
svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
{
struct cache_head *ch;
- int hash = new->ek_fsidtype;
- char * cp = (char*)new->ek_fsid;
- int len = key_len(new->ek_fsidtype);
-
- hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
- hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
- hash &= EXPKEY_HASHMASK;
+ int hash = svc_expkey_hash(new);
ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
&old->h, hash);
@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = {
.alloc = svc_export_alloc,
};
-static struct svc_export *
-svc_export_lookup(struct svc_export *exp)
+static int
+svc_export_hash(struct svc_export *exp)
{
- struct cache_head *ch;
int hash;
+
hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+ return hash;
+}
+
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+ struct cache_head *ch;
+ int hash = svc_export_hash(exp);
ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
hash);
@@ -759,10 +768,7 @@ static struct svc_export *
svc_export_update(struct svc_export *new, struct svc_export *old)
{
struct cache_head *ch;
- int hash;
- hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
- hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
- hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
+ int hash = svc_export_hash(old);
ch = sunrpc_cache_update(&svc_export_cache, &new->h,
&old->h,
@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
err = 0;
finish:
kfree(new.ex_pathname);
- if (exp)
+ if (!IS_ERR_OR_NULL(exp))
exp_put(exp);
- if (fsid_key && !IS_ERR(fsid_key))
+ if (!IS_ERR_OR_NULL(fsid_key))
cache_put(&fsid_key->h, &svc_expkey_cache);
path_put(&path);
out_put_clp:
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7e32bd3..eb78e7e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
*/
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/slab.h>
#include "nfsd.h"
#include "state.h"
@@ -79,11 +80,6 @@ enum nfs_cb_opnum4 {
cb_sequence_dec_sz + \
op_dec_sz)
-struct nfs4_rpc_args {
- void *args_op;
- struct nfsd4_cb_sequence args_seq;
-};
-
/*
* Generic encode routines from fs/nfs/nfs4xdr.c
*/
@@ -428,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
};
static struct rpc_version nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use. The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
.number = 1,
.nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
.procs = nfs4_cb_procedures
};
static struct rpc_version * nfs_cb_version[] = {
- NULL,
&nfs_cb_version4,
};
@@ -456,15 +458,14 @@ static struct rpc_program cb_program = {
static int max_cb_time(void)
{
- return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
+ return max(nfsd4_lease/10, (time_t)1) * HZ;
}
/* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cb_set an atomic? */
+ * And why is cl_cb_set an atomic? */
-int setup_callback_client(struct nfs4_client *clp)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
{
- struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_timeout timeparms = {
.to_initval = max_cb_time(),
.to_retries = 0,
@@ -476,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
.timeout = &timeparms,
.program = &cb_program,
.prognumber = cb->cb_prog,
- .version = nfs_cb_version[1]->number,
+ .version = 0,
.authflavor = clp->cl_flavor,
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
.client_name = clp->cl_principal,
@@ -486,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
return -EINVAL;
if (cb->cb_minorversion) {
- args.bc_xprt = clp->cl_cb_xprt;
+ args.bc_xprt = cb->cb_xprt;
args.protocol = XPRT_TRANSPORT_BC_TCP;
}
/* Create RPC client */
@@ -496,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
PTR_ERR(client));
return PTR_ERR(client);
}
- cb->cb_client = client;
+ nfsd4_set_callback_client(clp, client);
return 0;
}
@@ -514,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
if (task->tk_status)
warn_no_callback_path(clp, task->tk_status);
else
- atomic_set(&clp->cl_cb_conn.cb_set, 1);
- put_nfs4_client(clp);
+ atomic_set(&clp->cl_cb_set, 1);
}
static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -537,7 +537,6 @@ int set_callback_cred(void)
void do_probe_callback(struct nfs4_client *clp)
{
- struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
.rpc_argp = clp,
@@ -545,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
};
int status;
- status = rpc_call_async(cb->cb_client, &msg,
+ status = rpc_call_async(clp->cl_cb_client, &msg,
RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
&nfsd4_cb_probe_ops, (void *)clp);
- if (status) {
+ if (status)
warn_no_callback_path(clp, status);
- put_nfs4_client(clp);
- }
}
/*
* Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
*/
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
+void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
{
int status;
- BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
+ BUG_ON(atomic_read(&clp->cl_cb_set));
- status = setup_callback_client(clp);
+ status = setup_callback_client(clp, cb);
if (status) {
warn_no_callback_path(clp, status);
return;
}
-
- /* the task holds a reference to the nfs4_client struct */
- atomic_inc(&clp->cl_count);
-
do_probe_callback(clp);
}
@@ -658,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
}
}
+
static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfs4_delegation *dp = calldata;
struct nfs4_client *clp = dp->dl_client;
+ struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
nfsd4_cb_done(task, calldata);
+ if (current_rpc_client == NULL) {
+ /* We're shutting down; give up. */
+ /* XXX: err, or is it ok just to fall through
+ * and rpc_restart_call? */
+ return;
+ }
+
switch (task->tk_status) {
case -EIO:
/* Network partition? */
- atomic_set(&clp->cl_cb_conn.cb_set, 0);
+ atomic_set(&clp->cl_cb_set, 0);
warn_no_callback_path(clp, task->tk_status);
+ if (current_rpc_client != task->tk_client) {
+ /* queue a callback on the new connection: */
+ nfsd4_cb_recall(dp);
+ return;
+ }
case -EBADHANDLE:
case -NFS4ERR_BAD_STATEID:
/* Race: client probably got cb_recall
@@ -677,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
break;
default:
/* success, or error we can't handle */
- goto done;
+ return;
}
if (dp->dl_retries--) {
rpc_delay(task, 2*HZ);
@@ -685,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
rpc_restart_call(task);
return;
} else {
- atomic_set(&clp->cl_cb_conn.cb_set, 0);
+ atomic_set(&clp->cl_cb_set, 0);
warn_no_callback_path(clp, task->tk_status);
}
-done:
- kfree(task->tk_msg.rpc_argp);
}
static void nfsd4_cb_recall_release(void *calldata)
{
struct nfs4_delegation *dp = calldata;
- struct nfs4_client *clp = dp->dl_client;
nfs4_put_delegation(dp);
- put_nfs4_client(clp);
}
static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -707,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
.rpc_release = nfsd4_cb_recall_release,
};
+static struct workqueue_struct *callback_wq;
+
+int nfsd4_create_callback_queue(void)
+{
+ callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+ if (!callback_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+ destroy_workqueue(callback_wq);
+}
+
+/* must be called under the state lock */
+void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+{
+ struct rpc_clnt *old = clp->cl_cb_client;
+
+ clp->cl_cb_client = new;
+ /*
+ * After this, any work that saw the old value of cl_cb_client will
+ * be gone:
+ */
+ flush_workqueue(callback_wq);
+ /* So we can safely shut it down: */
+ if (old)
+ rpc_shutdown_client(old);
+}
+
/*
* called with dp->dl_count inc'ed.
*/
-void
-nfsd4_cb_recall(struct nfs4_delegation *dp)
+static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_client;
- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
- struct nfs4_rpc_args *args;
+ struct rpc_clnt *clnt = clp->cl_cb_client;
+ struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
.rpc_cred = callback_cred
};
- int status = -ENOMEM;
+ int status;
+
+ if (clnt == NULL)
+ return; /* Client is shutting down; give up. */
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- goto out;
args->args_op = dp;
msg.rpc_argp = args;
dp->dl_retries = 1;
status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
&nfsd4_cb_recall_ops, dp);
-out:
- if (status) {
- kfree(args);
- put_nfs4_client(clp);
+ if (status)
nfs4_put_delegation(dp);
- }
+}
+
+void nfsd4_do_callback_rpc(struct work_struct *w)
+{
+ /* XXX: for now, just send off delegation recall. */
+ /* In future, generalize to handle any sort of callback. */
+ struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+ struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
+
+ _nfsd4_cb_recall(dp);
+}
+
+
+void nfsd4_cb_recall(struct nfs4_delegation *dp)
+{
+ queue_work(callback_wq, &dp->dl_recall.cb_work);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2ab9e85..59ec449 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
static const char *nfsd4_op_name(unsigned opnum);
/*
- * Enforce NFSv4.1 COMPOUND ordering rules.
+ * Enforce NFSv4.1 COMPOUND ordering rules:
*
- * TODO:
- * - enforce NFS4ERR_NOT_ONLY_OP,
- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ * Also note, enforced elsewhere:
+ * - SEQUENCE other than as first op results in
+ * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ * - BIND_CONN_TO_SESSION must be the only op in its compound
+ * (Will be enforced in nfsd4_bind_conn_to_session().)
+ * - DESTROY_SESSION must be the final operation in a compound, if
+ * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ * (Enforced in nfsd4_destroy_session().)
*/
-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
{
- if (args->minorversion && args->opcnt > 0) {
- struct nfsd4_op *op = &args->ops[0];
- return (op->status == nfserr_op_illegal) ||
- (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
- }
- return true;
+ struct nfsd4_op *op = &args->ops[0];
+
+ /* These ordering requirements don't apply to NFSv4.0: */
+ if (args->minorversion == 0)
+ return nfs_ok;
+ /* This is weird, but OK, not our problem: */
+ if (args->opcnt == 0)
+ return nfs_ok;
+ if (op->status == nfserr_op_illegal)
+ return nfs_ok;
+ if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+ return nfserr_op_not_in_session;
+ if (op->opnum == OP_SEQUENCE)
+ return nfs_ok;
+ if (args->opcnt != 1)
+ return nfserr_not_only_op;
+ return nfs_ok;
}
/*
@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
resp->rqstp = rqstp;
resp->cstate.minorversion = args->minorversion;
resp->cstate.replay_owner = NULL;
+ resp->cstate.session = NULL;
fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
/* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
if (args->minorversion > nfsd_supported_minorversion)
goto out;
- if (!nfs41_op_ordering_ok(args)) {
+ status = nfs41_check_op_ordering(args);
+ if (status) {
op = &args->ops[0];
- op->status = nfserr_sequence_pos;
+ op->status = status;
goto encode_op;
}
- status = nfs_ok;
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
.op_name = "OP_SEQUENCE",
},
+ [OP_RECLAIM_COMPLETE] = {
+ .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_RECLAIM_COMPLETE",
+ },
};
static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6a8feda..12f7109 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -45,8 +45,8 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
/* Globals */
-static time_t lease_time = 90; /* default lease time */
-static time_t user_lease_time = 90;
+time_t nfsd4_lease = 90; /* default lease time */
+time_t nfsd4_grace = 90;
static time_t boot_time;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
@@ -190,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
dp->dl_vfs_file = stp->st_vfs_file;
dp->dl_type = type;
dp->dl_ident = cb->cb_ident;
- dp->dl_stateid.si_boot = get_seconds();
+ dp->dl_stateid.si_boot = boot_time;
dp->dl_stateid.si_stateownerid = current_delegid++;
dp->dl_stateid.si_fileid = 0;
dp->dl_stateid.si_generation = 0;
@@ -199,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
atomic_set(&dp->dl_count, 1);
list_add(&dp->dl_perfile, &fp->fi_delegations);
list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
return dp;
}
@@ -249,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
* SETCLIENTID state
*/
+/* client_lock protects the client lru list and session hash table */
+static DEFINE_SPINLOCK(client_lock);
+
/* Hash tables for nfs4_clientid state */
#define CLIENT_HASH_BITS 4
#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
@@ -367,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
nfs4_put_stateowner(sop);
}
-static DEFINE_SPINLOCK(sessionid_lock);
#define SESSION_HASH_SIZE 512
static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
@@ -565,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
new->se_flags = cses->flags;
kref_init(&new->se_ref);
- spin_lock(&sessionid_lock);
+ spin_lock(&client_lock);
list_add(&new->se_hash, &sessionid_hashtbl[idx]);
list_add(&new->se_perclnt, &clp->cl_sessions);
- spin_unlock(&sessionid_lock);
+ spin_unlock(&client_lock);
status = nfs_ok;
out:
@@ -579,7 +582,7 @@ out_free:
goto out;
}
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
static struct nfsd4_session *
find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
{
@@ -602,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
return NULL;
}
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
static void
unhash_session(struct nfsd4_session *ses)
{
@@ -610,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
list_del(&ses->se_perclnt);
}
-static void
-release_session(struct nfsd4_session *ses)
-{
- spin_lock(&sessionid_lock);
- unhash_session(ses);
- spin_unlock(&sessionid_lock);
- nfsd4_put_session(ses);
-}
-
void
free_session(struct kref *kref)
{
@@ -634,9 +628,18 @@ free_session(struct kref *kref)
kfree(ses);
}
+/* must be called under the client_lock */
static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
{
+ if (is_client_expired(clp)) {
+ dprintk("%s: client (clientid %08x/%08x) already expired\n",
+ __func__,
+ clp->cl_clientid.cl_boot,
+ clp->cl_clientid.cl_id);
+ return;
+ }
+
/*
* Move client to the end to the LRU list.
*/
@@ -647,6 +650,14 @@ renew_client(struct nfs4_client *clp)
clp->cl_time = get_seconds();
}
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+ spin_lock(&client_lock);
+ renew_client_locked(clp);
+ spin_unlock(&client_lock);
+}
+
/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
static int
STALE_CLIENTID(clientid_t *clid)
@@ -680,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
return clp;
}
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
- struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
- if (clnt) {
- /*
- * Callback threads take a reference on the client, so there
- * should be no outstanding callbacks at this point.
- */
- clp->cl_cb_conn.cb_client = NULL;
- rpc_shutdown_client(clnt);
- }
-}
-
static inline void
free_client(struct nfs4_client *clp)
{
- shutdown_callback_client(clp);
- if (clp->cl_cb_xprt)
- svc_xprt_put(clp->cl_cb_xprt);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
kfree(clp->cl_principal);
@@ -709,10 +702,34 @@ free_client(struct nfs4_client *clp)
}
void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
{
- if (atomic_dec_and_test(&clp->cl_count))
+ struct nfs4_client *clp = session->se_client;
+
+ if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+ return;
+ if (is_client_expired(clp)) {
free_client(clp);
+ session->se_client = NULL;
+ } else
+ renew_client_locked(clp);
+ spin_unlock(&client_lock);
+ nfsd4_put_session(session);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+ mark_client_expired(clp);
+ list_del(&clp->cl_lru);
+ while (!list_empty(&clp->cl_sessions)) {
+ struct nfsd4_session *ses;
+ ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+ se_perclnt);
+ unhash_session(ses);
+ nfsd4_put_session(ses);
+ }
}
static void
@@ -722,9 +739,6 @@ expire_client(struct nfs4_client *clp)
struct nfs4_delegation *dp;
struct list_head reaplist;
- dprintk("NFSD: expire_client cl_count %d\n",
- atomic_read(&clp->cl_count));
-
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
while (!list_empty(&clp->cl_delegations)) {
@@ -740,20 +754,20 @@ expire_client(struct nfs4_client *clp)
list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
- list_del(&clp->cl_idhash);
- list_del(&clp->cl_strhash);
- list_del(&clp->cl_lru);
while (!list_empty(&clp->cl_openowners)) {
sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
release_openowner(sop);
}
- while (!list_empty(&clp->cl_sessions)) {
- struct nfsd4_session *ses;
- ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
- se_perclnt);
- release_session(ses);
- }
- put_nfs4_client(clp);
+ nfsd4_set_callback_client(clp, NULL);
+ if (clp->cl_cb_conn.cb_xprt)
+ svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+ list_del(&clp->cl_idhash);
+ list_del(&clp->cl_strhash);
+ spin_lock(&client_lock);
+ unhash_client_locked(clp);
+ if (atomic_read(&clp->cl_refcount) == 0)
+ free_client(clp);
+ spin_unlock(&client_lock);
}
static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -839,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
}
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
- atomic_set(&clp->cl_count, 1);
- atomic_set(&clp->cl_cb_conn.cb_set, 0);
+ atomic_set(&clp->cl_refcount, 0);
+ atomic_set(&clp->cl_cb_set, 0);
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_sessions);
INIT_LIST_HEAD(&clp->cl_lru);
+ clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
copy_verf(clp, verf);
@@ -877,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
idhashval = clientid_hashval(clp->cl_clientid.cl_id);
list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
- list_add_tail(&clp->cl_lru, &client_lru);
- clp->cl_time = get_seconds();
+ renew_client(clp);
}
static void
@@ -888,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
unsigned int strhashval;
dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
- list_del_init(&clp->cl_strhash);
list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
strhashval = clientstr_hashval(clp->cl_recdir);
- list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+ list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
renew_client(clp);
}
@@ -1327,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cs_slot->sl_seqid++; /* from 0 to 1 */
move_to_confirmed(unconf);
- /*
- * We do not support RDMA or persistent sessions
- */
- cr_ses->flags &= ~SESSION4_PERSIST;
- cr_ses->flags &= ~SESSION4_RDMA;
-
if (cr_ses->flags & SESSION4_BACK_CHAN) {
- unconf->cl_cb_xprt = rqstp->rq_xprt;
- svc_xprt_get(unconf->cl_cb_xprt);
+ unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+ svc_xprt_get(rqstp->rq_xprt);
rpc_copy_addr(
(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
sa);
@@ -1344,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cstate->minorversion;
unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
unconf->cl_cb_seq_nr = 1;
- nfsd4_probe_callback(unconf);
+ nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
}
conf = unconf;
} else {
@@ -1352,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out;
}
+ /*
+ * We do not support RDMA or persistent sessions
+ */
+ cr_ses->flags &= ~SESSION4_PERSIST;
+ cr_ses->flags &= ~SESSION4_RDMA;
+
status = alloc_init_session(rqstp, conf, cr_ses);
if (status)
goto out;
@@ -1369,6 +1382,21 @@ out:
return status;
}
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+ return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+ if (!session)
+ return 0;
+ return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
__be32
nfsd4_destroy_session(struct svc_rqst *r,
struct nfsd4_compound_state *cstate,
@@ -1384,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
* - Do we need to clear any callback info from previous session?
*/
+ if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+ if (!nfsd4_last_compound_op(r))
+ return nfserr_not_only_op;
+ }
dump_sessionid(__func__, &sessionid->sessionid);
- spin_lock(&sessionid_lock);
+ spin_lock(&client_lock);
ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
if (!ses) {
- spin_unlock(&sessionid_lock);
+ spin_unlock(&client_lock);
goto out;
}
unhash_session(ses);
- spin_unlock(&sessionid_lock);
+ spin_unlock(&client_lock);
+ nfs4_lock_state();
/* wait for callbacks */
- shutdown_callback_client(ses->se_client);
+ nfsd4_set_callback_client(ses->se_client, NULL);
+ nfs4_unlock_state();
nfsd4_put_session(ses);
status = nfs_ok;
out:
@@ -1417,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (resp->opcnt != 1)
return nfserr_sequence_pos;
- spin_lock(&sessionid_lock);
+ spin_lock(&client_lock);
status = nfserr_badsession;
session = find_in_sessionid_hashtbl(&seq->sessionid);
if (!session)
@@ -1456,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
cstate->slot = slot;
cstate->session = session;
- /* Hold a session reference until done processing the compound:
- * nfsd4_put_session called only if the cstate slot is set.
- */
- nfsd4_get_session(session);
out:
- spin_unlock(&sessionid_lock);
- /* Renew the clientid on success and on replay */
+ /* Hold a session reference until done processing the compound. */
if (cstate->session) {
- nfs4_lock_state();
- renew_client(session->se_client);
- nfs4_unlock_state();
+ nfsd4_get_session(cstate->session);
+ atomic_inc(&session->se_client->cl_refcount);
}
+ spin_unlock(&client_lock);
dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
}
__be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+ if (rc->rca_one_fs) {
+ if (!cstate->current_fh.fh_dentry)
+ return nfserr_nofilehandle;
+ /*
+ * We don't take advantage of the rca_one_fs case.
+ * That's OK, it's optional, we can safely ignore it.
+ */
+ return nfs_ok;
+ }
+ nfs4_lock_state();
+ if (is_client_expired(cstate->session->se_client)) {
+ nfs4_unlock_state();
+ /*
+ * The following error isn't really legal.
+ * But we only get here if the client just explicitly
+ * destroyed the client. Surely it no longer cares what
+ * error it gets back on an operation for the dead
+ * client.
+ */
+ return nfserr_stale_clientid;
+ }
+ nfsd4_create_clid_dir(cstate->session->se_client);
+ nfs4_unlock_state();
+ return nfs_ok;
+}
+
+__be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
@@ -1631,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
status = nfserr_clid_inuse;
else {
- /* XXX: We just turn off callbacks until we can handle
- * change request correctly. */
- atomic_set(&conf->cl_cb_conn.cb_set, 0);
+ atomic_set(&conf->cl_cb_set, 0);
+ nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
expire_client(unconf);
status = nfs_ok;
@@ -1667,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
move_to_confirmed(unconf);
conf = unconf;
- nfsd4_probe_callback(conf);
+ nfsd4_probe_callback(conf, &conf->cl_cb_conn);
status = nfs_ok;
}
} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1700,12 +1757,12 @@ alloc_init_file(struct inode *ino)
INIT_LIST_HEAD(&fp->fi_hash);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
- spin_lock(&recall_lock);
- list_add(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
fp->fi_inode = igrab(ino);
fp->fi_id = current_fileid++;
fp->fi_had_conflict = false;
+ spin_lock(&recall_lock);
+ list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+ spin_unlock(&recall_lock);
return fp;
}
return NULL;
@@ -1827,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
stp->st_stateowner = sop;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = get_seconds();
+ stp->st_stateid.si_boot = boot_time;
stp->st_stateid.si_stateownerid = sop->so_id;
stp->st_stateid.si_fileid = fp->fi_id;
stp->st_stateid.si_generation = 0;
@@ -2028,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
* lock) we know the server hasn't removed the lease yet, we know
* it's safe to take a reference: */
atomic_inc(&dp->dl_count);
- atomic_inc(&dp->dl_client->cl_count);
spin_lock(&recall_lock);
list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2347,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
{
struct nfs4_delegation *dp;
struct nfs4_stateowner *sop = stp->st_stateowner;
- struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+ int cb_up = atomic_read(&sop->so_client->cl_cb_set);
struct file_lock fl, *flp = &fl;
int status, flag = 0;
@@ -2355,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
open->op_recall = 0;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
- if (!atomic_read(&cb->cb_set))
+ if (!cb_up)
open->op_recall = 1;
flag = open->op_delegate_type;
if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2366,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
* had the chance to reclaim theirs.... */
if (locks_in_grace())
goto out;
- if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+ if (!cb_up || !sop->so_confirmed)
goto out;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2483,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
}
memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
- if (nfsd4_has_session(&resp->cstate)) {
+ if (nfsd4_has_session(&resp->cstate))
open->op_stateowner->so_confirmed = 1;
- nfsd4_create_clid_dir(open->op_stateowner->so_client);
- }
/*
* Attempt to hand out a delegation. No error return, because the
@@ -2537,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
renew_client(clp);
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
- && !atomic_read(&clp->cl_cb_conn.cb_set))
+ && !atomic_read(&clp->cl_cb_set))
goto out;
status = nfs_ok;
out:
@@ -2554,6 +2608,12 @@ nfsd4_end_grace(void)
dprintk("NFSD: end of grace period\n");
nfsd4_recdir_purge_old();
locks_end_grace(&nfsd4_manager);
+ /*
+ * Now that every NFSv4 client has had the chance to recover and
+ * to see the (possibly new, possibly shorter) lease time, we
+ * can safely set the next grace time to the current lease time:
+ */
+ nfsd4_grace = nfsd4_lease;
}
static time_t
@@ -2563,15 +2623,17 @@ nfs4_laundromat(void)
struct nfs4_stateowner *sop;
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
- time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
- time_t t, clientid_val = NFSD_LEASE_TIME;
- time_t u, test_val = NFSD_LEASE_TIME;
+ time_t cutoff = get_seconds() - nfsd4_lease;
+ time_t t, clientid_val = nfsd4_lease;
+ time_t u, test_val = nfsd4_lease;
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
if (locks_in_grace())
nfsd4_end_grace();
+ INIT_LIST_HEAD(&reaplist);
+ spin_lock(&client_lock);
list_for_each_safe(pos, next, &client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2580,12 +2642,22 @@ nfs4_laundromat(void)
clientid_val = t;
break;
}
+ if (atomic_read(&clp->cl_refcount)) {
+ dprintk("NFSD: client in use (clientid %08x)\n",
+ clp->cl_clientid.cl_id);
+ continue;
+ }
+ unhash_client_locked(clp);
+ list_add(&clp->cl_lru, &reaplist);
+ }
+ spin_unlock(&client_lock);
+ list_for_each_safe(pos, next, &reaplist) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
dprintk("NFSD: purging unused client (clientid %08x)\n",
clp->cl_clientid.cl_id);
nfsd4_remove_clid_dir(clp);
expire_client(clp);
}
- INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
list_for_each_safe(pos, next, &del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -2605,7 +2677,7 @@ nfs4_laundromat(void)
list_del_init(&dp->dl_recall_lru);
unhash_delegation(dp);
}
- test_val = NFSD_LEASE_TIME;
+ test_val = nfsd4_lease;
list_for_each_safe(pos, next, &close_lru) {
sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2661,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
static int
STALE_STATEID(stateid_t *stateid)
{
- if (time_after((unsigned long)boot_time,
- (unsigned long)stateid->si_boot)) {
- dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
- STATEID_VAL(stateid));
- return 1;
- }
- return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
- if (time_before((unsigned long)boot_time,
- ((unsigned long)stateid->si_boot)) &&
- time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
- dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
- STATEID_VAL(stateid));
- return 1;
- }
- return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
- if (STALE_STATEID(stateid))
- return nfserr_stale_stateid;
- if (EXPIRED_STATEID(stateid))
- return nfserr_expired;
-
- dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
+ if (stateid->si_boot == boot_time)
+ return 0;
+ dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
STATEID_VAL(stateid));
- return nfserr_bad_stateid;
+ return 1;
}
static inline int
@@ -2817,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
status = nfserr_bad_stateid;
if (is_delegation_stateid(stateid)) {
dp = find_delegation_stateid(ino, stateid);
- if (!dp) {
- status = stateid_error_map(stateid);
+ if (!dp)
goto out;
- }
status = check_stateid_generation(stateid, &dp->dl_stateid,
flags);
if (status)
@@ -2833,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
*filpp = dp->dl_vfs_file;
} else { /* open or lock stateid */
stp = find_stateid(stateid, flags);
- if (!stp) {
- status = stateid_error_map(stateid);
+ if (!stp)
goto out;
- }
if (nfs4_check_fh(current_fh, stp))
goto out;
if (!stp->st_stateowner->so_confirmed)
@@ -2908,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
*/
sop = search_close_lru(stateid->si_stateownerid, flags);
if (sop == NULL)
- return stateid_error_map(stateid);
+ return nfserr_bad_stateid;
*sopp = sop;
goto check_replay;
}
@@ -3175,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!is_delegation_stateid(stateid))
goto out;
dp = find_delegation_stateid(inode, stateid);
- if (!dp) {
- status = stateid_error_map(stateid);
+ if (!dp)
goto out;
- }
status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
if (status)
goto out;
@@ -3404,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
stp->st_stateowner = sop;
get_nfs4_file(fp);
stp->st_file = fp;
- stp->st_stateid.si_boot = get_seconds();
+ stp->st_stateid.si_boot = boot_time;
stp->st_stateid.si_stateownerid = sop->so_id;
stp->st_stateid.si_fileid = fp->fi_id;
stp->st_stateid.si_generation = 0;
@@ -3976,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
printk("NFSD: Failure reading reboot recovery data\n");
}
-unsigned long
-get_nfs4_grace_period(void)
-{
- return max(user_lease_time, lease_time) * HZ;
-}
-
/*
* Since the lifetime of a delegation isn't limited to that of an open, a
* client may quite reasonably hang on to a delegation as long as it has
@@ -4008,20 +4040,27 @@ set_max_delegations(void)
static int
__nfs4_state_start(void)
{
- unsigned long grace_time;
+ int ret;
boot_time = get_seconds();
- grace_time = get_nfs4_grace_period();
- lease_time = user_lease_time;
locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
- grace_time/HZ);
+ nfsd4_grace);
+ ret = set_callback_cred();
+ if (ret)
+ return -ENOMEM;
laundry_wq = create_singlethread_workqueue("nfsd4");
if (laundry_wq == NULL)
return -ENOMEM;
- queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+ ret = nfsd4_create_callback_queue();
+ if (ret)
+ goto out_free_laundry;
+ queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
set_max_delegations();
- return set_callback_cred();
+ return 0;
+out_free_laundry:
+ destroy_workqueue(laundry_wq);
+ return ret;
}
int
@@ -4039,12 +4078,6 @@ nfs4_state_start(void)
return 0;
}
-time_t
-nfs4_lease_time(void)
-{
- return lease_time;
-}
-
static void
__nfs4_state_shutdown(void)
{
@@ -4089,6 +4122,7 @@ nfs4_state_shutdown(void)
nfs4_lock_state();
nfs4_release_reclaim();
__nfs4_state_shutdown();
+ nfsd4_destroy_callback_queue();
nfs4_unlock_state();
}
@@ -4128,21 +4162,3 @@ nfs4_recoverydir(void)
{
return user_recovery_dirname;
}
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot. Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time. If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
- user_lease_time = leasetime;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 34ccf81..ac17a70 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1234,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
DECODE_TAIL;
}
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(4);
+ READ32(rc->rca_one_fs);
+
+ DECODE_TAIL;
+}
+
static __be32
nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
{
@@ -1346,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
};
struct nfsd4_minorversion_ops {
@@ -1900,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
if ((buflen -= 4) < 0)
goto out_resource;
- WRITE32(NFSD_LEASE_TIME);
+ WRITE32(nfsd4_lease);
}
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
if ((buflen -= 4) < 0)
@@ -3307,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov = &rqstp->rq_res.head[0];
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
- if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
- nfsd4_store_cache_entry(resp);
- dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
- resp->cstate.slot->sl_inuse = false;
- nfsd4_put_session(resp->cstate.session);
+ if (nfsd4_has_session(cs)) {
+ if (cs->status != nfserr_replay_cache) {
+ nfsd4_store_cache_entry(resp);
+ dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+ cs->slot->sl_inuse = false;
+ }
+ /* Renew the clientid on success and on replay */
+ release_session_client(cs->session);
}
return 1;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e359107..bc3194e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -46,6 +46,7 @@ enum {
*/
#ifdef CONFIG_NFSD_V4
NFSD_Leasetime,
+ NFSD_Gracetime,
NFSD_RecoveryDir,
#endif
};
@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
#ifdef CONFIG_NFSD_V4
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
#endif
@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_MaxBlkSize] = write_maxblksize,
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = write_leasetime,
+ [NFSD_Gracetime] = write_gracetime,
[NFSD_RecoveryDir] = write_recoverydir,
#endif
};
@@ -1204,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
}
#ifdef CONFIG_NFSD_V4
-extern time_t nfs4_leasetime(void);
-
-static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
{
- /* if size > 10 seconds, call
- * nfs4_reset_lease() then write out the new lease (seconds) as reply
- */
char *mesg = buf;
- int rv, lease;
+ int rv, i;
if (size > 0) {
if (nfsd_serv)
return -EBUSY;
- rv = get_int(&mesg, &lease);
+ rv = get_int(&mesg, &i);
if (rv)
return rv;
- if (lease < 10 || lease > 3600)
+ /*
+ * Some sanity checking. We don't have a reason for
+ * these particular numbers, but problems with the
+ * extremes are:
+ * - Too short: the briefest network outage may
+ * cause clients to lose all their locks. Also,
+ * the frequent polling may be wasteful.
+ * - Too long: do you really want reboot recovery
+ * to take more than an hour? Or to make other
+ * clients wait an hour before being able to
+ * revoke a dead client's locks?
+ */
+ if (i < 10 || i > 3600)
return -EINVAL;
- nfs4_reset_lease(lease);
+ *time = i;
}
- return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n",
- nfs4_lease_time());
+ return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
+}
+
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __nfsd4_write_time(file, buf, size, time);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
}
/**
@@ -1252,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
*/
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
- ssize_t rv;
+ return nfsd4_write_time(file, buf, size, &nfsd4_lease);
+}
- mutex_lock(&nfsd_mutex);
- rv = __write_leasetime(file, buf, size);
- mutex_unlock(&nfsd_mutex);
- return rv;
+/**
+ * write_gracetime - Set or report current NFSv4 grace period time
+ *
+ * As above, but sets the time of the NFSv4 grace period.
+ *
+ * Note this should never be set to less than the *previous*
+ * lease-period time, but we don't try to enforce this. (In the common
+ * case (a new boot), we don't know what the previous lease time was
+ * anyway.)
+ */
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
+{
+ return nfsd4_write_time(file, buf, size, &nfsd4_grace);
}
extern char *nfs4_recoverydir(void);
@@ -1351,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
#endif
/* last one */ {""}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1a..7237776 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
void nfs4_state_shutdown(void);
-time_t nfs4_lease_time(void);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
#else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
static inline void nfs4_state_shutdown(void) { }
-static inline time_t nfs4_lease_time(void) { return 0; }
static inline void nfs4_reset_lease(time_t leasetime) { }
static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
#endif
@@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot;
#ifdef CONFIG_NFSD_V4
+extern time_t nfsd4_lease;
+extern time_t nfsd4_grace;
+
/* before processing a COMPOUND operation, we have to check that there
* is enough space in the buffer for XDR encode to succeed. otherwise,
* we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot;
#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
-#define NFSD_LEASE_TIME (nfs4_lease_time())
#define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
/*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699e..06b2a26 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
int nfsd_vers(int vers, enum vers_op change)
{
if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
- return -1;
+ return 0;
switch(change) {
case NFSD_SET:
nfsd_versions[vers] = nfsd_version[vers];
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae2..006c842 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
struct nfs4_client *cbs_clp;
};
+struct nfs4_rpc_args {
+ void *args_op;
+ struct nfsd4_cb_sequence args_seq;
+};
+
+struct nfsd4_callback {
+ struct nfs4_rpc_args cb_args;
+ struct work_struct cb_work;
+};
+
struct nfs4_delegation {
struct list_head dl_perfile;
struct list_head dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
stateid_t dl_stateid;
struct knfsd_fh dl_fh;
int dl_retries;
+ struct nfsd4_callback dl_recall;
};
/* client delegation callback info */
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
u32 cb_prog;
u32 cb_minorversion;
u32 cb_ident; /* minorversion 0 only */
- /* RPC client info */
- atomic_t cb_set; /* successful CB_NULL call */
- struct rpc_clnt * cb_client;
+ struct svc_xprt *cb_xprt; /* minorversion 1 only */
};
/* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
struct list_head se_hash; /* hash by sessionid */
struct list_head se_perclnt;
u32 se_flags;
- struct nfs4_client *se_client; /* for expire_client */
+ struct nfs4_client *se_client;
struct nfs4_sessionid se_sessionid;
struct nfsd4_channel_attrs se_fchannel;
struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
struct svc_cred cl_cred; /* setclientid principal */
clientid_t cl_clientid; /* generated by server */
nfs4_verifier cl_confirm; /* generated by server */
- struct nfs4_cb_conn cl_cb_conn; /* callback info */
- atomic_t cl_count; /* ref count */
u32 cl_firststate; /* recovery dir creation */
+ /* for v4.0 and v4.1 callbacks: */
+ struct nfs4_cb_conn cl_cb_conn;
+ struct rpc_clnt *cl_cb_client;
+ atomic_t cl_cb_set;
+
/* for nfs41 */
struct list_head cl_sessions;
struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */
u32 cl_exchange_flags;
struct nfs4_sessionid cl_sessionid;
+ /* number of rpc's in progress over an associated session: */
+ atomic_t cl_refcount;
/* for nfs41 callbacks */
/* We currently support a single back channel with a single slot */
unsigned long cl_cb_slot_busy;
u32 cl_cb_seq_nr;
- struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
};
+static inline void
+mark_client_expired(struct nfs4_client *clp)
+{
+ clp->cl_time = 0;
+}
+
+static inline bool
+is_client_expired(struct nfs4_client *clp)
+{
+ return clp->cl_time == 0;
+}
+
/* struct nfs4_client_reset
* one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
* upon lease reset, or from upcall to state_daemon (to read in state
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
extern int nfs4_in_grace(void);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void put_nfs4_client(struct nfs4_client *clp);
extern void nfs4_free_stateowner(struct kref *kref);
extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_do_callback_rpc(struct work_struct *);
extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
extern void nfsd4_recdir_purge_old(void);
extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+extern void release_session_client(struct nfsd4_session *);
static inline void
nfs4_put_stateowner(struct nfs4_stateowner *so)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6dd5f19..23c06f7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -724,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
struct inode *inode;
int flags = O_RDONLY|O_LARGEFILE;
__be32 err;
- int host_err;
+ int host_err = 0;
validate_process_creds();
@@ -761,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* Check to see if there are any leases on this file.
* This may block while leases are broken.
*/
- host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
+ if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
+ host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
if (host_err == -EWOULDBLOCK)
host_err = -ETIMEDOUT;
if (host_err) /* NOMEM or WOULDBLOCK */
@@ -1169,7 +1170,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+ err = nfsd_open(rqstp, fhp, S_IFREG,
+ NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
if (err)
goto out;
if (EX_ISSYNC(fhp->fh_export)) {
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a..217a62c 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
#define NFSD_MAY_OWNER_OVERRIDE 64
#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+#define NFSD_MAY_NOT_BREAK_LEASE 512
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa3377..4d476ff 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
struct nfs4_sessionid sessionid;
};
+struct nfsd4_reclaim_complete {
+ u32 rca_one_fs;
+};
+
struct nfsd4_op {
int opnum;
__be32 status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
struct nfsd4_create_session create_session;
struct nfsd4_destroy_session destroy_session;
struct nfsd4_sequence sequence;
+ struct nfsd4_reclaim_complete reclaim_complete;
} u;
struct nfs4_replay * replay;
};
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *,
-struct nfsd4_exchange_id *);
- extern __be32 nfsd4_create_session(struct svc_rqst *,
+ struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_create_session *);
extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
extern __be32 nfsd4_destroy_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_destroy_session *);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
struct nfsd4_open *open);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7cfb87e..d7fd696 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -31,6 +31,11 @@
#include "alloc.h"
+/**
+ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
+ * descriptor block can maintain
+ * @inode: inode of metadata file using this allocator
+ */
static inline unsigned long
nilfs_palloc_groups_per_desc_block(const struct inode *inode)
{
@@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
sizeof(struct nilfs_palloc_group_desc);
}
+/**
+ * nilfs_palloc_groups_count - get maximum number of groups
+ * @inode: inode of metadata file using this allocator
+ */
static inline unsigned long
nilfs_palloc_groups_count(const struct inode *inode)
{
return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
}
+/**
+ * nilfs_palloc_init_blockgroup - initialize private variables for allocator
+ * @inode: inode of metadata file using this allocator
+ * @entry_size: size of the persistent object
+ */
int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
return 0;
}
+/**
+ * nilfs_palloc_group - get group number and offset from an entry number
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @offset: pointer to store offset number in the group
+ */
static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
unsigned long *offset)
{
@@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
return group;
}
+/**
+ * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
+ * block which contains a descriptor of the specified group.
+ */
static unsigned long
nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
{
@@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
}
+/**
+ * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
+ * block used to allocate/deallocate entries in the specified group.
+ */
static unsigned long
nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
{
@@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
}
+/**
+ * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ */
static unsigned long
nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
const struct nilfs_palloc_group_desc *desc)
@@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
return nfree;
}
+/**
+ * nilfs_palloc_group_desc_add_entries - adjust count of free entries
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ * @n: delta to be added
+ */
static void
nilfs_palloc_group_desc_add_entries(struct inode *inode,
unsigned long group,
@@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
spin_unlock(nilfs_mdt_bgl_lock(inode, group));
}
+/**
+ * nilfs_palloc_entry_blkoff - get block offset of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ */
static unsigned long
nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
{
@@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
group_offset / NILFS_MDT(inode)->mi_entries_per_block;
}
+/**
+ * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
+ * @inode: inode of metadata file
+ * @bh: buffer head of the buffer to be initialized
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
static void nilfs_palloc_desc_block_init(struct inode *inode,
struct buffer_head *bh, void *kaddr)
{
@@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
return ret;
}
+/**
+ * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
static int nilfs_palloc_get_desc_block(struct inode *inode,
unsigned long group,
int create, struct buffer_head **bhp)
@@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
bhp, &cache->prev_desc, &cache->lock);
}
+/**
+ * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
static int nilfs_palloc_get_bitmap_block(struct inode *inode,
unsigned long group,
int create, struct buffer_head **bhp)
@@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
&cache->prev_bitmap, &cache->lock);
}
+/**
+ * nilfs_palloc_get_entry_block - get buffer head of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
int create, struct buffer_head **bhp)
{
@@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
&cache->prev_entry, &cache->lock);
}
+/**
+ * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @bh: buffer head of the buffer storing the group descriptor block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
static struct nilfs_palloc_group_desc *
nilfs_palloc_block_get_group_desc(const struct inode *inode,
unsigned long group,
@@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
group % nilfs_palloc_groups_per_desc_block(inode);
}
+/**
+ * nilfs_palloc_block_get_entry - get kernel address of an entry
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @bh: buffer head of the buffer storing the entry block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
const struct buffer_head *bh, void *kaddr)
{
@@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
entry_offset * NILFS_MDT(inode)->mi_entry_size;
}
+/**
+ * nilfs_palloc_find_available_slot - find available slot in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @target: offset number of an entry in the group (start point)
+ * @bitmap: bitmap of the group
+ * @bsize: size in bits
+ */
static int nilfs_palloc_find_available_slot(struct inode *inode,
unsigned long group,
unsigned long target,
unsigned char *bitmap,
- int bsize) /* size in bits */
+ int bsize)
{
int curr, pos, end, i;
@@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
return -ENOSPC;
}
+/**
+ * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
+ * in a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @curr: current group number
+ * @max: maximum number of groups
+ */
static unsigned long
nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
unsigned long curr, unsigned long max)
@@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
max - curr + 1);
}
+/**
+ * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
return ret;
}
+/**
+ * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
void nilfs_palloc_commit_alloc_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
brelse(req->pr_desc_bh);
}
+/**
+ * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
void nilfs_palloc_commit_free_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
brelse(req->pr_desc_bh);
}
+/**
+ * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
void nilfs_palloc_abort_alloc_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
req->pr_desc_bh = NULL;
}
+/**
+ * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
int nilfs_palloc_prepare_free_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
return 0;
}
+/**
+ * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
void nilfs_palloc_abort_free_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
@@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
req->pr_desc_bh = NULL;
}
+/**
+ * nilfs_palloc_group_is_in - judge if an entry is in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @nr: serial number of the entry (e.g. inode number)
+ */
static int
nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
{
@@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
return (nr >= first) && (nr <= last);
}
+/**
+ * nilfs_palloc_freev - deallocate a set of persistent objects
+ * @inode: inode of metadata file using this allocator
+ * @entry_nrs: array of entry numbers to be deallocated
+ * @nitems: number of entries stored in @entry_nrs
+ */
int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
{
struct buffer_head *desc_bh, *bitmap_bh;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf87..9af34a7 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
#include <linux/buffer_head.h>
#include <linux/fs.h>
+/**
+ * nilfs_palloc_entries_per_group - get the number of entries per group
+ * @inode: inode of metadata file using this allocator
+ *
+ * The number of entries per group is defined by the number of bits
+ * that a bitmap block can maintain.
+ */
static inline unsigned long
nilfs_palloc_entries_per_group(const struct inode *inode)
{
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 76c38e3..b27a342 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
#include "alloc.h"
#include "dat.h"
-/**
- * struct nilfs_btree_path - A path on which B-tree operations are executed
- * @bp_bh: buffer head of node block
- * @bp_sib_bh: buffer head of sibling node block
- * @bp_index: index of child node
- * @bp_oldreq: ptr end request for old ptr
- * @bp_newreq: ptr alloc request for new ptr
- * @bp_op: rebalance operation
- */
-struct nilfs_btree_path {
- struct buffer_head *bp_bh;
- struct buffer_head *bp_sib_bh;
- int bp_index;
- union nilfs_bmap_ptr_req bp_oldreq;
- union nilfs_bmap_ptr_req bp_newreq;
- struct nilfs_btnode_chkey_ctxt bp_ctxt;
- void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
- int, __u64 *, __u64 *);
-};
-
-/*
- * B-tree path operations
- */
-
-static struct kmem_cache *nilfs_btree_path_cache;
-
-int __init nilfs_btree_path_cache_init(void)
-{
- nilfs_btree_path_cache =
- kmem_cache_create("nilfs2_btree_path_cache",
- sizeof(struct nilfs_btree_path) *
- NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
- return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
-}
-
-void nilfs_btree_path_cache_destroy(void)
-{
- kmem_cache_destroy(nilfs_btree_path_cache);
-}
-
-static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
-{
- return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
-}
-
-static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
+static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
- kmem_cache_free(nilfs_btree_path_cache, path);
-}
+ struct nilfs_btree_path *path;
+ int level = NILFS_BTREE_LEVEL_DATA;
-static void nilfs_btree_init_path(struct nilfs_btree_path *path)
-{
- int level;
+ path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+ if (path == NULL)
+ goto out;
- for (level = NILFS_BTREE_LEVEL_DATA;
- level < NILFS_BTREE_LEVEL_MAX;
- level++) {
+ for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
path[level].bp_bh = NULL;
path[level].bp_sib_bh = NULL;
path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
path[level].bp_op = NULL;
}
+
+out:
+ return path;
}
-static void nilfs_btree_release_path(struct nilfs_btree_path *path)
+static void nilfs_btree_free_path(struct nilfs_btree_path *path)
{
- int level;
+ int level = NILFS_BTREE_LEVEL_DATA;
- for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
- level++)
+ for (; level < NILFS_BTREE_LEVEL_MAX; level++)
brelse(path[level].bp_bh);
+
+ kmem_cache_free(nilfs_btree_path_cache, path);
}
/*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ptrp != NULL)
*ptrp = ptr;
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
+
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ret < 0)
goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
*ptrp = ptr;
ret = cnt;
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
}
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
}
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
+
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
}
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
if (buffer_nilfs_node(bh)) {
node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
nilfs_btree_propagate_p(btree, path, level, bh);
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
if (buffer_nilfs_node(*bh)) {
node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
nilfs_bmap_set_dirty(&btree->bt_bmap);
out:
- nilfs_btree_release_path(path);
nilfs_btree_free_path(path);
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84..af638d5 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
#include "btnode.h"
#include "bmap.h"
-struct nilfs_btree;
-struct nilfs_btree_path;
-
/**
* struct nilfs_btree - B-tree structure
* @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
struct nilfs_bmap bt_bmap;
};
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+ struct buffer_head *bp_bh;
+ struct buffer_head *bp_sib_bh;
+ int bp_index;
+ union nilfs_bmap_ptr_req bp_oldreq;
+ union nilfs_bmap_ptr_req bp_newreq;
+ struct nilfs_btnode_chkey_ctxt bp_ctxt;
+ void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+ int, __u64 *, __u64 *);
+};
#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
#define NILFS_BTREE_KEY_MIN ((__u64)0)
#define NILFS_BTREE_KEY_MAX (~(__u64)0)
+extern struct kmem_cache *nilfs_btree_path_cache;
int nilfs_btree_path_cache_init(void);
void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0957b58..5e226d4 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -451,7 +451,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
inode->i_op = &nilfs_special_inode_operations;
init_special_inode(
inode, inode->i_mode,
- new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+ huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
}
nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
brelse(bh);
@@ -511,7 +511,7 @@ void nilfs_write_inode_common(struct inode *inode,
nilfs_bmap_write(ii->i_bmap, raw_inode);
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_device_code =
- cpu_to_le64(new_encode_dev(inode->i_rdev));
+ cpu_to_le64(huge_encode_dev(inode->i_rdev));
/* When extending inode, nilfs->ns_inode_size should be checked
for substitutions of appended fields */
}
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba43146..bae2a51 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -105,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+ /* need to verify ->ss_bytes field if read ->ss_cno */
}
/**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 17851f7..2e6a272 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,35 +40,10 @@ struct nilfs_write_info {
sector_t blocknr;
};
-
static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
struct the_nilfs *nilfs);
static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
- memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
- nilfs_segbuf_cachep =
- kmem_cache_create("nilfs2_segbuf_cache",
- sizeof(struct nilfs_segment_buffer),
- 0, SLAB_RECLAIM_ACCOUNT,
- nilfs_segbuf_init_once);
-
- return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
- kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
{
struct nilfs_segment_buffer *segbuf;
@@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
INIT_LIST_HEAD(&segbuf->sb_list);
INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+ segbuf->sb_super_root = NULL;
init_completion(&segbuf->sb_bio_event);
atomic_set(&segbuf->sb_err, 0);
@@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
}
int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
- time_t ctime)
+ time_t ctime, __u64 cno)
{
int err;
@@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
segbuf->sb_sum.ctime = ctime;
+ segbuf->sb_sum.cno = cno;
return 0;
}
@@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
raw_sum->ss_pad = 0;
+ raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
}
/*
* CRC calculation routines
*/
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
- u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
{
struct buffer_head *bh;
struct nilfs_segment_summary *raw_sum;
@@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
raw_sum->ss_sumsum = cpu_to_le32(crc);
}
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
- u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+ u32 seed)
{
struct buffer_head *bh;
struct nilfs_segment_summary *raw_sum;
@@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
raw_sum->ss_datasum = cpu_to_le32(crc);
}
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+ u32 seed)
+{
+ struct nilfs_super_root *raw_sr;
+ u32 crc;
+
+ raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+ crc = crc32_le(seed,
+ (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+ NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+ raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
static void nilfs_release_buffers(struct list_head *list)
{
struct buffer_head *bh, *n;
@@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
{
nilfs_release_buffers(&segbuf->sb_segsum_buffers);
nilfs_release_buffers(&segbuf->sb_payload_buffers);
+ segbuf->sb_super_root = NULL;
}
/*
@@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs)
return ret;
}
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+ struct nilfs_segment_buffer *segbuf;
+
+ list_for_each_entry(segbuf, logs, sb_list) {
+ if (segbuf->sb_super_root)
+ nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+ nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+ nilfs_segbuf_fill_in_data_crc(segbuf, seed);
+ }
+}
+
/*
* BIO operations
*/
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd35..fdf1c3b 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
* @sumbytes: Byte count of segment summary
* @nfileblk: Total number of file blocks
* @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
* @ctime: Creation time
* @next: Block number of the next full segment
*/
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
unsigned long sumbytes;
unsigned long nfileblk;
u64 seg_seq;
+ __u64 cno;
time_t ctime;
sector_t next;
};
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
* @sb_rest_blocks: Number of residual blocks in the current segment
* @sb_segsum_buffers: List of buffers for segment summaries
* @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
* @sb_nbio: Number of flying bio requests
* @sb_err: I/O error status
* @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
/* Buffers */
struct list_head sb_segsum_buffers;
struct list_head sb_payload_buffers; /* including super root */
+ struct buffer_head *sb_super_root;
/* io status */
int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
b_assoc_buffers))
#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
+extern struct kmem_cache *nilfs_segbuf_cachep;
int __init nilfs_init_segbuf_cache(void);
void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
struct nilfs_segment_buffer *prev);
void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
struct buffer_head **);
void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
static inline void
nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
struct nilfs_segment_buffer *last);
int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
static inline void nilfs_destroy_logs(struct list_head *logs)
{
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6a7dbd8..c920164 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
- nilfs_transaction_cachep =
- kmem_cache_create("nilfs2_transaction_cache",
- sizeof(struct nilfs_transaction_info),
- 0, SLAB_RECLAIM_ACCOUNT, NULL);
- return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_destroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
- kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
{
struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
if (nilfs_doing_gc())
flags = NILFS_SS_GC;
- err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+ err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+ sci->sc_sbi->s_nilfs->ns_cno);
if (unlikely(err))
return err;
@@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
return err;
segbuf = sci->sc_curseg;
}
- err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+ err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
if (likely(!err))
segbuf->sb_sum.flags |= NILFS_SS_SR;
return err;
@@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
*vblocknr = binfo->bi_v.bi_vblocknr;
}
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_file_bmap,
@@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
*binfo_dat = binfo->bi_dat;
}
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
.collect_data = nilfs_collect_dat_data,
.collect_node = nilfs_collect_file_node,
.collect_bmap = nilfs_collect_dat_bmap,
@@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
.write_node_binfo = nilfs_write_dat_node_binfo,
};
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
.collect_data = nilfs_collect_file_data,
.collect_node = NULL,
.collect_bmap = NULL,
@@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
}
}
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
- struct nilfs_super_root *raw_sr =
- (struct nilfs_super_root *)bh_sr->b_data;
- u32 crc;
-
- crc = crc32_le(seed,
- (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
- NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
- raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
- u32 seed)
-{
- struct nilfs_segment_buffer *segbuf;
-
- if (sci->sc_super_root)
- nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
- list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
- nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
- nilfs_segbuf_fill_in_data_crc(segbuf, seed);
- }
-}
-
static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct buffer_head *bh_sr = sci->sc_super_root;
- struct nilfs_super_root *raw_sr =
- (struct nilfs_super_root *)bh_sr->b_data;
+ struct buffer_head *bh_sr;
+ struct nilfs_super_root *raw_sr;
unsigned isz = nilfs->ns_inode_size;
+ bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+ raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
raw_sr->sr_nongc_ctime
= cpu_to_le64(nilfs_doing_gc() ?
@@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
/* Collection retry loop */
for (;;) {
- sci->sc_super_root = NULL;
sci->sc_nblk_this_inc = 0;
sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
@@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
ssp.offset = sizeof(struct nilfs_segment_summary);
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- if (bh == sci->sc_super_root)
+ if (bh == segbuf->sb_super_root)
break;
if (!finfo) {
finfo = nilfs_segctor_map_segsum_entry(
@@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- if (bh == sci->sc_super_root) {
+ if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
clear_page_dirty_for_io(bd_page);
@@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
}
static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
- struct buffer_head *bh_sr, int err)
+ int err)
{
struct nilfs_segment_buffer *segbuf;
struct page *bd_page = NULL, *fs_page = NULL;
@@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- if (bh == bh_sr) {
+ if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
@@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
list_splice_tail_init(&sci->sc_write_logs, &logs);
ret = nilfs_wait_on_logs(&logs);
- nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
+ nilfs_abort_logs(&logs, NULL, ret ? : err);
list_splice_tail_init(&sci->sc_segbufs, &logs);
nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
}
nilfs_destroy_logs(&logs);
- sci->sc_super_root = NULL;
}
static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
struct nilfs_segment_buffer *segbuf;
struct page *bd_page = NULL, *fs_page = NULL;
struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
- int update_sr = (sci->sc_super_root != NULL);
+ int update_sr = false;
list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
clear_buffer_nilfs_volatile(bh);
- if (bh == sci->sc_super_root) {
+ if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
}
+ update_sr = true;
break;
}
if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct nilfs_sb_info *sbi = sci->sc_sbi;
struct the_nilfs *nilfs = sbi->s_nilfs;
struct page *failed_page;
- int err, has_sr = 0;
+ int err;
sci->sc_stage.scnt = NILFS_ST_INIT;
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
goto failed;
- has_sr = (sci->sc_super_root != NULL);
-
/* Avoid empty segment */
if (sci->sc_stage.scnt == NILFS_ST_DONE &&
NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
- if (has_sr) {
+ if (mode == SC_LSEG_SR &&
+ sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
/* Write partial segments */
err = nilfs_segctor_prepare_write(sci, &failed_page);
if (err) {
- nilfs_abort_logs(&sci->sc_segbufs, failed_page,
- sci->sc_super_root, err);
+ nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
goto failed_to_write;
}
- nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+ nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+ nilfs->ns_crc_seed);
err = nilfs_segctor_write(sci, nilfs);
if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
}
} while (sci->sc_stage.scnt != NILFS_ST_DONE);
- sci->sc_super_root = NULL;
-
out:
nilfs_segctor_check_out_files(sci, sbi);
return err;
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
{
spin_lock(&sci->sc_state_lock);
- if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
- sci->sc_timer->expires = jiffies + sci->sc_interval;
- add_timer(sci->sc_timer);
+ if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+ sci->sc_timer.expires = jiffies + sci->sc_interval;
+ add_timer(&sci->sc_timer);
sci->sc_state |= NILFS_SEGCTOR_COMMIT;
}
spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
spin_lock(&sci->sc_state_lock);
sci->sc_seq_accepted = sci->sc_seq_request;
spin_unlock(&sci->sc_state_lock);
-
- if (sci->sc_timer)
- del_timer_sync(sci->sc_timer);
+ del_timer_sync(&sci->sc_timer);
}
/**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
sci->sc_flush_request &= ~FLUSH_DAT_BIT;
/* re-enable timer if checkpoint creation was not done */
- if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
- time_before(jiffies, sci->sc_timer->expires))
- add_timer(sci->sc_timer);
+ if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+ time_before(jiffies, sci->sc_timer.expires))
+ add_timer(&sci->sc_timer);
}
spin_unlock(&sci->sc_state_lock);
}
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
{
struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
- struct timer_list timer;
int timeout = 0;
- init_timer(&timer);
- timer.data = (unsigned long)current;
- timer.function = nilfs_construction_timeout;
- sci->sc_timer = &timer;
+ sci->sc_timer.data = (unsigned long)current;
+ sci->sc_timer.function = nilfs_construction_timeout;
/* start sync. */
sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
should_sleep = 0;
else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
should_sleep = time_before(jiffies,
- sci->sc_timer->expires);
+ sci->sc_timer.expires);
if (should_sleep) {
spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
}
finish_wait(&sci->sc_wait_daemon, &wait);
timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
- time_after_eq(jiffies, sci->sc_timer->expires));
+ time_after_eq(jiffies, sci->sc_timer.expires));
if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
end_thread:
spin_unlock(&sci->sc_state_lock);
- del_timer_sync(sci->sc_timer);
- sci->sc_timer = NULL;
/* end sync. */
sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
}
}
-static int nilfs_segctor_init(struct nilfs_sc_info *sci)
-{
- sci->sc_seq_done = sci->sc_seq_request;
-
- return nilfs_segctor_start_thread(sci);
-}
-
/*
* Setup & clean-up functions
*/
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
INIT_LIST_HEAD(&sci->sc_copied_buffers);
+ init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
down_write(&sbi->s_nilfs->ns_segctor_sem);
+ del_timer_sync(&sci->sc_timer);
kfree(sci);
}
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
return -ENOMEM;
nilfs_attach_writer(nilfs, sbi);
- err = nilfs_segctor_init(NILFS_SC(sbi));
+ err = nilfs_segctor_start_thread(NILFS_SC(sbi));
if (err) {
nilfs_detach_writer(nilfs, sbi);
kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a..dca1423 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
* @sc_write_logs: List of segment buffers to hold logs under writing
* @sc_segbuf_nblocks: Number of available blocks in segment buffers.
* @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
* @sc_stage: Collection stage
* @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
* @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
struct list_head sc_write_logs;
unsigned long sc_segbuf_nblocks;
struct nilfs_segment_buffer *sc_curseg;
- struct buffer_head *sc_super_root;
struct nilfs_cstage sc_stage;
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
unsigned long sc_lseg_stime; /* in 1/HZ seconds */
unsigned long sc_watermark;
- struct timer_list *sc_timer;
+ struct timer_list sc_timer;
struct task_struct *sc_task;
};
@@ -219,6 +217,8 @@ enum {
*/
#define NILFS_SC_DEFAULT_WATERMARK 3600
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
/* segment.c */
extern int nilfs_init_transaction_cache(void);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 48145f5..03b34b7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
"(NILFS)");
MODULE_LICENSE("GPL");
+struct kmem_cache *nilfs_inode_cachep;
+struct kmem_cache *nilfs_transaction_cachep;
+struct kmem_cache *nilfs_segbuf_cachep;
+struct kmem_cache *nilfs_btree_path_cache;
+
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
va_end(args);
}
-static struct kmem_cache *nilfs_inode_cachep;
struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
{
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
}
-static void init_once(void *obj)
-{
- struct nilfs_inode_info *ii = obj;
-
- INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
- init_rwsem(&ii->xattr_sem);
-#endif
- nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
- ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
- inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
- nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
- sizeof(struct nilfs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT,
- init_once);
-
- return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
- kmem_cache_destroy(nilfs_inode_cachep);
-}
-
static void nilfs_clear_inode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
int err;
/* nilfs->sem must be locked by the caller. */
- if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
- if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
+ if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+ if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
nilfs_swap_super_block(nilfs);
else {
printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (nilfs_test_opt(sbi, SNAPSHOT))
seq_printf(seq, ",cp=%llu",
(unsigned long long int)sbi->s_snapshot_cno);
- if (nilfs_test_opt(sbi, ERRORS_RO))
- seq_printf(seq, ",errors=remount-ro");
if (nilfs_test_opt(sbi, ERRORS_PANIC))
seq_printf(seq, ",errors=panic");
+ if (nilfs_test_opt(sbi, ERRORS_CONT))
+ seq_printf(seq, ",errors=continue");
if (nilfs_test_opt(sbi, STRICT_ORDER))
seq_printf(seq, ",order=strict");
if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
struct nilfs_super_block *sbp)
{
sbi->s_mount_opt =
- NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+ NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
}
static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -778,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
goto failed_sbi;
}
cno = sbi->s_snapshot_cno;
- } else
- /* Read-only mount */
- sbi->s_snapshot_cno = cno;
+ }
}
err = nilfs_attach_checkpoint(sbi, cno);
@@ -849,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
struct the_nilfs *nilfs = sbi->s_nilfs;
unsigned long old_sb_flags;
struct nilfs_mount_options old_opts;
- int err;
+ int was_snapshot, err;
lock_kernel();
@@ -857,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
old_sb_flags = sb->s_flags;
old_opts.mount_opt = sbi->s_mount_opt;
old_opts.snapshot_cno = sbi->s_snapshot_cno;
+ was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
if (!parse_options(data, sb)) {
err = -EINVAL;
@@ -864,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
}
sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
- if ((*flags & MS_RDONLY) &&
- sbi->s_snapshot_cno != old_opts.snapshot_cno) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount to a different snapshot.\n",
- sb->s_id);
- err = -EINVAL;
- goto restore_opts;
+ err = -EINVAL;
+ if (was_snapshot) {
+ if (!(*flags & MS_RDONLY)) {
+ printk(KERN_ERR "NILFS (device %s): cannot remount "
+ "snapshot read/write.\n",
+ sb->s_id);
+ goto restore_opts;
+ } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
+ printk(KERN_ERR "NILFS (device %s): cannot "
+ "remount to a different snapshot.\n",
+ sb->s_id);
+ goto restore_opts;
+ }
+ } else {
+ if (nilfs_test_opt(sbi, SNAPSHOT)) {
+ printk(KERN_ERR "NILFS (device %s): cannot change "
+ "a regular mount to a snapshot.\n",
+ sb->s_id);
+ goto restore_opts;
+ }
}
if (!nilfs_valid_fs(nilfs)) {
printk(KERN_WARNING "NILFS (device %s): couldn't "
"remount because the filesystem is in an "
"incomplete recovery state.\n", sb->s_id);
- err = -EINVAL;
goto restore_opts;
}
@@ -888,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
nilfs_detach_segment_constructor(sbi);
sb->s_flags |= MS_RDONLY;
- sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
- /* nilfs_set_opt(sbi, SNAPSHOT); */
-
/*
* Remounting a valid RW partition RDONLY, so set
* the RDONLY flag and then mark the partition as valid again.
@@ -909,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
* store the current valid flag. (It may have been changed
* by fsck since we originally mounted the partition.)
*/
- if (nilfs->ns_current && nilfs->ns_current != sbi) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount because an RW-mount exists.\n",
- sb->s_id);
- err = -EBUSY;
- goto restore_opts;
- }
- if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount because the current RO-mount is not "
- "the latest one.\n",
- sb->s_id);
- err = -EINVAL;
- goto restore_opts;
- }
sb->s_flags &= ~MS_RDONLY;
- nilfs_clear_opt(sbi, SNAPSHOT);
- sbi->s_snapshot_cno = 0;
err = nilfs_attach_segment_constructor(sbi);
if (err)
@@ -935,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
down_write(&nilfs->ns_sem);
nilfs_setup_super(sbi);
up_write(&nilfs->ns_sem);
-
- nilfs->ns_current = sbi;
}
out:
up_write(&nilfs->ns_super_sem);
@@ -1022,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
{
struct nilfs_super_data sd;
struct super_block *s;
+ fmode_t mode = FMODE_READ;
struct the_nilfs *nilfs;
int err, need_to_close = 1;
- sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
+ if (!(flags & MS_RDONLY))
+ mode |= FMODE_WRITE;
+
+ sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
if (IS_ERR(sd.bdev))
return PTR_ERR(sd.bdev);
@@ -1092,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
/* New superblock instance created */
s->s_flags = flags;
+ s->s_mode = mode;
strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
sb_set_blocksize(s, block_size(sd.bdev));
- err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
+ err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
+ nilfs);
if (err)
goto cancel_new;
@@ -1106,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
mutex_unlock(&nilfs->ns_mount_mutex);
put_nilfs(nilfs);
if (need_to_close)
- close_bdev_exclusive(sd.bdev, flags);
+ close_bdev_exclusive(sd.bdev, mode);
simple_set_mnt(mnt, s);
return 0;
@@ -1114,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
mutex_unlock(&nilfs->ns_mount_mutex);
put_nilfs(nilfs);
failed:
- close_bdev_exclusive(sd.bdev, flags);
+ close_bdev_exclusive(sd.bdev, mode);
return err;
@@ -1124,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
put_nilfs(nilfs);
deactivate_locked_super(s);
/*
- * deactivate_super() invokes close_bdev_exclusive().
+ * deactivate_locked_super() invokes close_bdev_exclusive().
* We must finish all post-cleaning before this call;
* put_nilfs() needs the block device.
*/
@@ -1139,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-static int __init init_nilfs_fs(void)
+static void nilfs_inode_init_once(void *obj)
{
- int err;
-
- err = nilfs_init_inode_cache();
- if (err)
- goto failed;
+ struct nilfs_inode_info *ii = obj;
- err = nilfs_init_transaction_cache();
- if (err)
- goto failed_inode_cache;
+ INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+ init_rwsem(&ii->xattr_sem);
+#endif
+ nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+ ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
+ inode_init_once(&ii->vfs_inode);
+}
- err = nilfs_init_segbuf_cache();
- if (err)
- goto failed_transaction_cache;
+static void nilfs_segbuf_init_once(void *obj)
+{
+ memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
- err = nilfs_btree_path_cache_init();
- if (err)
- goto failed_segbuf_cache;
+static void nilfs_destroy_cachep(void)
+{
+ if (nilfs_inode_cachep)
+ kmem_cache_destroy(nilfs_inode_cachep);
+ if (nilfs_transaction_cachep)
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ if (nilfs_segbuf_cachep)
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ if (nilfs_btree_path_cache)
+ kmem_cache_destroy(nilfs_btree_path_cache);
+}
- err = register_filesystem(&nilfs_fs_type);
- if (err)
- goto failed_btree_path_cache;
+static int __init nilfs_init_cachep(void)
+{
+ nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+ sizeof(struct nilfs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ if (!nilfs_inode_cachep)
+ goto fail;
+
+ nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
+ sizeof(struct nilfs_transaction_info), 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (!nilfs_transaction_cachep)
+ goto fail;
+
+ nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
+ sizeof(struct nilfs_segment_buffer), 0,
+ SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
+ if (!nilfs_segbuf_cachep)
+ goto fail;
+
+ nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
+ sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+ 0, 0, NULL);
+ if (!nilfs_btree_path_cache)
+ goto fail;
return 0;
- failed_btree_path_cache:
- nilfs_btree_path_cache_destroy();
+fail:
+ nilfs_destroy_cachep();
+ return -ENOMEM;
+}
+
+static int __init init_nilfs_fs(void)
+{
+ int err;
- failed_segbuf_cache:
- nilfs_destroy_segbuf_cache();
+ err = nilfs_init_cachep();
+ if (err)
+ goto fail;
- failed_transaction_cache:
- nilfs_destroy_transaction_cache();
+ err = register_filesystem(&nilfs_fs_type);
+ if (err)
+ goto free_cachep;
- failed_inode_cache:
- nilfs_destroy_inode_cache();
+ printk(KERN_INFO "NILFS version 2 loaded\n");
+ return 0;
- failed:
+free_cachep:
+ nilfs_destroy_cachep();
+fail:
return err;
}
static void __exit exit_nilfs_fs(void)
{
- nilfs_destroy_segbuf_cache();
- nilfs_destroy_transaction_cache();
- nilfs_destroy_inode_cache();
- nilfs_btree_path_cache_destroy();
+ nilfs_destroy_cachep();
unregister_filesystem(&nilfs_fs_type);
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7..a756168 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
printk(KERN_WARNING
"NILFS warning: unable to read secondary superblock\n");
+ /*
+ * Compare two super blocks and set 1 in swp if the secondary
+ * super block is valid and newer. Otherwise, set 0 in swp.
+ */
valid[0] = nilfs_valid_sb(sbp[0]);
valid[1] = nilfs_valid_sb(sbp[1]);
- swp = valid[1] &&
- (!valid[0] ||
- le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
+ swp = valid[1] && (!valid[0] ||
+ le64_to_cpu(sbp[1]->s_last_cno) >
+ le64_to_cpu(sbp[0]->s_last_cno));
if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
brelse(sbh[1]);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c088..07d9fd8 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
mmap.o \
namei.o \
refcounttree.o \
+ reservations.o \
resize.o \
slot_map.o \
suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd91..215e12c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
int count, status, i;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
struct ocfs2_super *osb =
OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
count = 0;
while (count < wanted) {
- status = ocfs2_claim_metadata(osb,
- handle,
+ status = ocfs2_claim_metadata(handle,
meta_ac,
wanted - count,
+ &suballoc_loc,
&suballoc_bit_start,
&num_got,
&first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
eb->h_suballoc_slot =
cpu_to_le16(meta_ac->ac_alloc_slot);
+ eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
/* We'll also be dirtied by the caller, so
* this isn't absolutely necessary. */
- status = ocfs2_journal_dirty(handle, bhs[i]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[i]);
}
count += num_got;
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
goto out;
}
- status = ocfs2_extend_trans(handle, path_num_items(path) +
- handle->h_buffer_credits);
+ status = ocfs2_extend_trans(handle, path_num_items(path));
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
+ ocfs2_journal_dirty(handle, bh);
next_blkno = le64_to_cpu(eb->h_blkno);
}
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, *last_eb_bh);
- if (status < 0)
- mlog_errno(status);
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0)
- mlog_errno(status);
- if (eb_bh) {
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0)
- mlog_errno(status);
- }
+ ocfs2_journal_dirty(handle, *last_eb_bh);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (eb_bh)
+ ocfs2_journal_dirty(handle, eb_bh);
/*
* Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
eb_el->l_recs[i] = root_el->l_recs[i];
- status = ocfs2_journal_dirty(handle, new_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_eb_bh);
status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
if (root_el->l_tree_depth == cpu_to_le16(1))
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
*ret_new_eb_bh = new_eb_bh;
new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
struct ocfs2_path *right_path,
int subtree_index)
{
- int ret, i, idx;
+ int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
right_el);
- ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
-
- ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+ ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
/*
* Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
root_bh = left_path->p_node[subtree_index].bh;
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
}
static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
ocfs2_create_empty_extent(right_el);
- ret = ocfs2_journal_dirty(handle, right_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, right_leaf_bh);
/* Do the copy now. */
i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&left_el->l_next_free_rec, 1);
- ret = ocfs2_journal_dirty(handle, left_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, left_leaf_bh);
ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
@@ -2249,8 +2210,8 @@ out:
*
* Will return zero if the path passed in is already the leftmost path.
*/
-static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int op_credits,
struct ocfs2_path *path)
{
- int ret;
+ int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits) {
+ if (handle->h_buffer_credits < credits)
ret = ocfs2_extend_trans(handle,
credits - handle->h_buffer_credits);
- if (ret)
- return ret;
- if (unlikely(handle->h_buffer_credits < credits))
- return ocfs2_extend_trans(handle, credits);
- }
-
- return 0;
+ return ret;
}
/*
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
* records for all the bh in the path.
* So we have to allocate extra credits and access them.
*/
- ret = ocfs2_extend_trans(handle,
- handle->h_buffer_credits + subtree_index);
+ ret = ocfs2_extend_trans(handle, subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
ocfs2_remove_empty_extent(right_leaf_el);
}
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
if (del_right_subtree) {
ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
if (right_has_empty)
ocfs2_remove_empty_extent(left_leaf_el);
- ret = ocfs2_journal_dirty(handle, et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et_root_bh);
*deleted = 1;
} else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
}
ocfs2_remove_empty_extent(el);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out:
return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (right_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
}
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (left_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
/*
* In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
le32_add_cpu(&rec->e_int_clusters,
-le32_to_cpu(rec->e_cpos));
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
}
}
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
struct buffer_head *leaf_bh = path_leaf_bh(right_path);
if (left_path) {
- int credits = handle->h_buffer_credits;
-
/*
* There's a chance that left_path got passed back to
* us without being accounted for in the
* journal. Extend our transaction here to be sure we
* can change those blocks.
*/
- credits += left_path->p_tree_depth;
-
- ret = ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
* dirty this for us.
*/
if (left_path)
- ret = ocfs2_journal_dirty(handle,
- path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle,
+ path_leaf_bh(left_path));
} else
ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
insert);
- ret = ocfs2_journal_dirty(handle, leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, leaf_bh);
if (left_path) {
/*
@@ -4384,9 +4307,7 @@ out_update_clusters:
ocfs2_et_update_clusters(et,
le16_to_cpu(insert_rec->e_leaf_clusters));
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
out:
ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
goto leave;
}
- status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ status = __ocfs2_claim_clusters(handle, data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (status < 0) {
if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
goto leave;
}
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
clusters_to_add -= num_bits;
*logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
- int ret, depth, credits = handle->h_buffer_credits;
+ int ret, depth, credits;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
} else
rightmost_el = path_leaf_el(path);
- credits += path->p_tree_depth +
- ocfs2_extend_meta_needed(et->et_root_el);
+ credits = path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
return ret;
}
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_alloctors(), except for it accepts a blocks
+ * number to reserve some extra blocks, and it only handles meta
+ * data allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **ac,
+ int extra_blocks)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *ac = NULL;
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ if (extra_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ if (*ac) {
+ ocfs2_free_alloc_context(*ac);
+ *ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc)
{
- int ret;
+ int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+
+ if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+ extra_blocks);
if (ret) {
mlog_errno(ret);
return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+ handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb) + credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
ocfs2_et_update_clusters(et, -len);
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
- if (ret)
- mlog_errno(ret);
+ if (phys_blkno) {
+ if (flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ phys_blkno),
+ len, meta_ac,
+ dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+ }
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
return ret;
}
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
}
tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
bail:
mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
tl->tl_used = cpu_to_le16(i);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
/* TODO: Perhaps we can calculate the bulk of the
* credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
*/
struct ocfs2_cached_block_free {
struct ocfs2_cached_block_free *free_next;
+ u64 free_bg;
u64 free_blk;
unsigned int free_bit;
};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
}
while (head) {
- bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
- head->free_bit);
+ if (head->free_bg)
+ bg_blkno = head->free_bg;
+ else
+ bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+ head->free_bit);
mlog(0, "Free bit: (bit %u, blkno %llu)\n",
head->free_bit, (unsigned long long)head->free_blk);
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int ret = 0;
struct ocfs2_cached_block_free *item;
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
}
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
- unsigned int bit)
+ int type, int slot, u64 suballoc,
+ u64 blkno, unsigned int bit)
{
int ret;
struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
goto out;
}
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
type, slot, bit, (unsigned long long)blkno);
+ item->free_bg = suballoc;
item->free_blk = blkno;
item->free_bit = bit;
item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
{
return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(eb->h_suballoc_slot),
+ le64_to_cpu(eb->h_suballoc_loc),
le64_to_cpu(eb->h_blkno),
le16_to_cpu(eb->h_suballoc_bit));
}
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct inode *inode,
- unsigned int clusters_to_del,
- struct ocfs2_path *path,
- struct buffer_head **new_last_eb)
-{
- int next_free, ret = 0;
- u32 cpos;
- struct ocfs2_extent_rec *rec;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
- struct buffer_head *bh = NULL;
-
- *new_last_eb = NULL;
-
- /* we have no tree, so of course, no last_eb. */
- if (!path->p_tree_depth)
- goto out;
-
- /* trunc to zero special case - this makes tree_depth = 0
- * regardless of what it is. */
- if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
- goto out;
-
- el = path_leaf_el(path);
- BUG_ON(!el->l_next_free_rec);
-
- /*
- * Make sure that this extent list will actually be empty
- * after we clear away the data. We can shortcut out if
- * there's more than one non-empty extent in the
- * list. Otherwise, a check of the remaining extent is
- * necessary.
- */
- next_free = le16_to_cpu(el->l_next_free_rec);
- rec = NULL;
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- if (next_free > 2)
- goto out;
-
- /* We may have a valid extent in index 1, check it. */
- if (next_free == 2)
- rec = &el->l_recs[1];
-
- /*
- * Fall through - no more nonempty extents, so we want
- * to delete this leaf.
- */
- } else {
- if (next_free > 1)
- goto out;
-
- rec = &el->l_recs[0];
- }
-
- if (rec) {
- /*
- * Check it we'll only be trimming off the end of this
- * cluster.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
- goto out;
- }
-
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
-
- /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
- * Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-
- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu, (cpos: %u)\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
-out:
- brelse(bh);
-
- return ret;
-}
-
-/*
- * Trim some clusters off the rightmost edge of a tree. Only called
- * during truncate.
- *
- * The caller needs to:
- * - start journaling of each path component.
- * - compute and fully set up any new last ext block
- */
-static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
- handle_t *handle, struct ocfs2_truncate_context *tc,
- u32 clusters_to_del, u64 *delete_start, u8 *flags)
-{
- int ret, i, index = path->p_tree_depth;
- u32 new_edge = 0;
- u64 deleted_eb = 0;
- struct buffer_head *bh;
- struct ocfs2_extent_list *el;
- struct ocfs2_extent_rec *rec;
-
- *delete_start = 0;
- *flags = 0;
-
- while (index >= 0) {
- bh = path->p_node[index].bh;
- el = path->p_node[index].el;
-
- mlog(0, "traveling tree (index = %d, block = %llu)\n",
- index, (unsigned long long)bh->b_blocknr);
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-
- if (index !=
- (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has invalid ext. block %llu",
- inode->i_ino,
- (unsigned long long)bh->b_blocknr);
- ret = -EROFS;
- goto out;
- }
-
-find_tail_record:
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- rec = &el->l_recs[i];
-
- mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
- "next = %u\n", i, le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
-
- if (le16_to_cpu(el->l_tree_depth) == 0) {
- /*
- * If the leaf block contains a single empty
- * extent and no records, we can just remove
- * the block.
- */
- if (i == 0 && ocfs2_is_empty_extent(rec)) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- el->l_next_free_rec = cpu_to_le16(0);
-
- goto delete;
- }
-
- /*
- * Remove any empty extents by shifting things
- * left. That should make life much easier on
- * the code below. This condition is rare
- * enough that we shouldn't see a performance
- * hit.
- */
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- for(i = 0;
- i < le16_to_cpu(el->l_next_free_rec); i++)
- el->l_recs[i] = el->l_recs[i + 1];
-
- memset(&el->l_recs[i], 0,
- sizeof(struct ocfs2_extent_rec));
-
- /*
- * We've modified our extent list. The
- * simplest way to handle this change
- * is to being the search from the
- * start again.
- */
- goto find_tail_record;
- }
-
- le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
-
- /*
- * We'll use "new_edge" on our way back up the
- * tree to know what our rightmost cpos is.
- */
- new_edge = le16_to_cpu(rec->e_leaf_clusters);
- new_edge += le32_to_cpu(rec->e_cpos);
-
- /*
- * The caller will use this to delete data blocks.
- */
- *delete_start = le64_to_cpu(rec->e_blkno)
- + ocfs2_clusters_to_blocks(inode->i_sb,
- le16_to_cpu(rec->e_leaf_clusters));
- *flags = rec->e_flags;
-
- /*
- * If it's now empty, remove this record.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- } else {
- if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- goto delete;
- }
-
- /* Can this actually happen? */
- if (le16_to_cpu(el->l_next_free_rec) == 0)
- goto delete;
-
- /*
- * We never actually deleted any clusters
- * because our leaf was empty. There's no
- * reason to adjust the rightmost edge then.
- */
- if (new_edge == 0)
- goto delete;
-
- rec->e_int_clusters = cpu_to_le32(new_edge);
- le32_add_cpu(&rec->e_int_clusters,
- -le32_to_cpu(rec->e_cpos));
-
- /*
- * A deleted child record should have been
- * caught above.
- */
- BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
- }
-
-delete:
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- mlog(0, "extent list container %llu, after: record %d: "
- "(%u, %u, %llu), next = %u.\n",
- (unsigned long long)bh->b_blocknr, i,
- le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- /*
- * We must be careful to only attempt delete of an
- * extent block (and not the root inode block).
- */
- if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
- struct ocfs2_extent_block *eb =
- (struct ocfs2_extent_block *)bh->b_data;
-
- /*
- * Save this for use when processing the
- * parent block.
- */
- deleted_eb = le64_to_cpu(eb->h_blkno);
-
- mlog(0, "deleting this extent block.\n");
-
- ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
-
- BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
- BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
- BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
-
- ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
- /* An error here is not fatal. */
- if (ret < 0)
- mlog_errno(ret);
- } else {
- deleted_eb = 0;
- }
-
- index--;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
- unsigned int clusters_to_del,
- struct inode *inode,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_truncate_context *tc,
- struct ocfs2_path *path,
- struct ocfs2_alloc_context *meta_ac)
-{
- int status;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *last_eb = NULL;
- struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh = NULL;
- u64 delete_blk = 0;
- u8 rec_flags;
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
- path, &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- /*
- * Each component will be touched, so we might as well journal
- * here to avoid having to handle errors later.
- */
- status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb_bh) {
- status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- el = &(fe->id2.i_list);
-
- /*
- * Lower levels depend on this never happening, but it's best
- * to check it up here before changing the tree.
- */
- if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has an empty extent record, depth %u\n",
- inode->i_ino, le16_to_cpu(el->l_tree_depth));
- status = -EROFS;
- goto bail;
- }
-
- dquot_free_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
- clusters_to_del;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- inode->i_blocks = ocfs2_inode_sector_count(inode);
-
- status = ocfs2_trim_tree(inode, path, handle, tc,
- clusters_to_del, &delete_blk, &rec_flags);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- if (le32_to_cpu(fe->i_clusters) == 0) {
- /* trunc to zero is a special case. */
- el->l_tree_depth = 0;
- fe->i_last_eb_blk = 0;
- } else if (last_eb)
- fe->i_last_eb_blk = last_eb->h_blkno;
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb) {
- /* If there will be a new last extent block, then by
- * definition, there cannot be any leaves to the right of
- * him. */
- last_eb->h_next_leaf_blk = 0;
- status = ocfs2_journal_dirty(handle, last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- if (delete_blk) {
- if (rec_flags & OCFS2_EXT_REFCOUNTED)
- status = ocfs2_decrease_refcount(inode, handle,
- ocfs2_blocks_to_clusters(osb->sb,
- delete_blk),
- clusters_to_del, meta_ac,
- &tc->tc_dealloc, 1);
- else
- status = ocfs2_truncate_log_append(osb, handle,
- delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
- status = 0;
-bail:
- brelse(last_eb_bh);
- mlog_exit(status);
- return status;
-}
-
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_commit;
did_quota = 1;
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
if (ret) {
mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
*/
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc)
+ struct buffer_head *di_bh)
{
- int status, i, credits, tl_sem = 0;
- u32 clusters_to_del, new_highest_cpos, range;
+ int status = 0, i, flags = 0;
+ u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
u64 blkno = 0;
struct ocfs2_extent_list *el;
- handle_t *handle = NULL;
- struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_extent_rec *rec;
struct ocfs2_path *path = NULL;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
- struct ocfs2_alloc_context *meta_ac = NULL;
- struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+ u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+ struct ocfs2_extent_tree et;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
mlog_entry_void();
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+ path = ocfs2_new_path(di_bh, &di->id2.i_list,
ocfs2_journal_access_di);
if (!path) {
status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
goto bail;
}
- credits = 0;
-
/*
* Truncate always works against the rightmost tree branch.
*/
@@ -7480,101 +7064,62 @@ start:
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
- range = le32_to_cpu(el->l_recs[i].e_cpos) +
- ocfs2_rec_clusters(el, &el->l_recs[i]);
- if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
- clusters_to_del = 0;
- } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
- clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
- blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+ rec = &el->l_recs[i];
+ flags = rec->e_flags;
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ "extent record, depth %u\n", inode->i_ino,
+ le16_to_cpu(root_el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
+ } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+ /*
+ * Truncate entire record.
+ */
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = ocfs2_rec_clusters(el, rec);
+ blkno = le64_to_cpu(rec->e_blkno);
} else if (range > new_highest_cpos) {
- clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
- le32_to_cpu(el->l_recs[i].e_cpos)) -
- new_highest_cpos;
- blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
- ocfs2_clusters_to_blocks(inode->i_sb,
- ocfs2_rec_clusters(el, &el->l_recs[i]) -
- clusters_to_del);
+ /*
+ * Partial truncate. it also should be
+ * the last truncate we're doing.
+ */
+ trunc_cpos = new_highest_cpos;
+ trunc_len = range - new_highest_cpos;
+ coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+ blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
} else {
+ /*
+ * Truncate completed, leave happily.
+ */
status = 0;
goto bail;
}
- mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
- clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
- if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
-
- status = ocfs2_lock_refcount_tree(osb,
- le64_to_cpu(di->i_refcount_loc),
- 1, &ref_tree, NULL);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
- blkno,
- clusters_to_del,
- &credits,
- &meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- mutex_lock(&tl_inode->i_mutex);
- tl_sem = 1;
- /* ocfs2_truncate_log_needs_flush guarantees us at least one
- * record is free for use. If there isn't any, we flush to get
- * an empty truncate log. */
- if (ocfs2_truncate_log_needs_flush(osb)) {
- status = __ocfs2_flush_truncate_log(osb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
- (struct ocfs2_dinode *)fe_bh->b_data,
- el);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
- tc, path, meta_ac);
+ status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags, &dealloc,
+ refcount_loc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mutex_unlock(&tl_inode->i_mutex);
- tl_sem = 0;
-
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
-
ocfs2_reinit_path(path, 1);
- if (meta_ac) {
- ocfs2_free_alloc_context(meta_ac);
- meta_ac = NULL;
- }
-
- if (ref_tree) {
- ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
- ref_tree = NULL;
- }
-
/*
* The check above will catch the case where we've truncated
* away all allocation.
@@ -7585,25 +7130,10 @@ bail:
ocfs2_schedule_truncate_log_flush(osb, 1);
- if (tl_sem)
- mutex_unlock(&tl_inode->i_mutex);
-
- if (handle)
- ocfs2_commit_trans(osb, handle);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
- if (ref_tree)
- ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
- ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+ ocfs2_run_deallocs(osb, &dealloc);
ocfs2_free_path(path);
- /* This will drop the ext_alloc cluster lock for us */
- ocfs2_free_truncate_context(tc);
-
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359..55762b5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc);
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
u64 blkno, unsigned int bit);
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
+ int type, int slot, u64 suballoc, u64 blkno,
unsigned int bit);
static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc);
+ struct buffer_head *di_bh);
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
unsigned int start, unsigned int end, int trunc);
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
struct ocfs2_path *path);
int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
struct ocfs2_path *left,
struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441dd..3623ca2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
goto out;
}
+ if (data_ac)
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list,
clusters_to_alloc);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a..c7fba39 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
+ define_mask(RESERVATIONS),
};
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbe..fd96e2a 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743e..aa75ca3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
o2net_sc_queue_work(sc, &sc->sc_connect_work);
break;
default:
+ printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
break;
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d0..f04ebcf 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
else
de->inode = 0;
dir->i_version++;
- status = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
goto bail;
}
i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
ocfs2_recalc_free_list(dir, handle, lookup);
dir->i_version++;
- status = ocfs2_journal_dirty(handle, insert_bh);
+ ocfs2_journal_dirty(handle, insert_bh);
retval = 0;
goto bail;
}
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
}
ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
i_size_write(inode, size);
inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_init_dir_trailer(inode, new_bh, size);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_bh);
i_size_write(inode, inode->i_sb->s_blocksize);
inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
int ret;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
u16 dr_suballoc_bit;
- u64 dr_blkno;
+ u64 suballoc_loc, dr_blkno;
unsigned int num_bits;
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dir_block_trailer *trailer =
ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
- &num_bits, &dr_blkno);
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &dr_suballoc_bit, &num_bits, &dr_blkno);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
memset(dx_root, 0, osb->sb->s_blocksize);
strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root->dr_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
}
-
- ret = ocfs2_journal_dirty(handle, dx_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, dx_root_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
*ret_dx_root_bh = dx_root_bh;
dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
* chance of contiguousness as the directory grows in number
* of entries.
*/
- ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+ ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* if we only get one now, that's enough to continue. The rest
* will be claimed after the conversion to extents.
*/
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &oi->ip_la_data_resv;
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_init_dir_trailer(dir, dirdata_bh, i);
}
- ret = ocfs2_journal_dirty(handle, dirdata_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, dirdata_bh);
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
/*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
*/
dir->i_blocks = ocfs2_inode_sector_count(dir);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, di_bh);
if (ocfs2_supports_indexed_dirs(osb)) {
ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* pass. Claim the 2nd cluster as a separate extent.
*/
if (alloc > len) {
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&len);
if (ret) {
mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
credits = ocfs2_calc_extend_credits(sb, el, 1);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
} else {
de->rec_len = cpu_to_le16(sb->s_blocksize);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
dx_leaf_sort_swap);
- ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, dx_leaf_bh);
ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
&split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
blk = le64_to_cpu(dx_root->dr_blkno);
bit = le16_to_cpu(dx_root->dr_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (dx_root->dr_suballoc_loc)
+ bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
bit, bg_blkno, 1);
if (ret)
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
- ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
- &dealloc);
+ ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+ &dealloc, 0);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 12d5eb7..f449991 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
return 0;
}
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
}
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+ lock->ml.node);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be3..4b6ae2c 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
# define DLM_HASH_PAGES 1
#else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90803b4..9f30491 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+ res->owner);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c905..6b5a492 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
assert_spin_locked(&dlm->spinlock);
- printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
+ printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
node = exit_msg->node_idx;
- printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
&leave_msg, sizeof(leave_msg), node,
NULL);
-
+ if (status < 0)
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
mlog(0, "status return %d from o2net_send_message\n", status);
return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
set_bit(assert->node_idx, dlm->domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
&cancel_msg, sizeof(cancel_msg), node,
NULL);
if (status < 0) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node,
- &join_resp);
+ sizeof(join_msg), node, &join_resp);
if (status < 0 && status != -ENOPROTOOPT) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
&assert_msg, sizeof(assert_msg), node,
NULL);
if (status < 0)
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+ node);
return status;
}
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+ dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
mlog_errno(-ENOMEM);
kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
for (i = 0; i < DLM_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
- strcpy(dlm->name, domain);
dlm->key = key;
dlm->node_num = o2nm_this_node();
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 73333777..69cf369 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
BUG();
}
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
+ res->owner);
if (dlm_is_host_down(tmpret)) {
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+ lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9289b43..4a7506a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res = NULL;
- res = (struct dlm_lock_resource *)
- kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+ res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
goto error;
- res->lockname.name = (char *)
- kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+ res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
if (!res->lockname.name)
goto error;
@@ -757,8 +755,7 @@ lookup:
spin_unlock(&dlm->spinlock);
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
- alloc_mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
- mlog(0, "assert_master returned %d!\n", tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", tmpret,
+ DLM_ASSERT_MASTER_MSG, dlm->key, to);
if (!dlm_is_host_down(tmpret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
@@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
+ res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
goto leave;
}
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
@@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(0, "migrate_request returned %d!\n", ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
+ dlm->key, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de..f8b75ce 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
/* negative status is handled by caller */
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
+ dlm->key, request_from);
// return from here, then
// sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
+ dlm->key, send_to);
if (!dlm_is_host_down(ret)) {
- mlog_errno(ret);
- mlog(ML_ERROR, "%s: unknown error sending data-done "
- "to %u\n", dlm->name, send_to);
BUG();
}
} else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
+ dlm->key, send_to);
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
&req, sizeof(req), nodenum, &status);
/* XXX: negative status not handled properly here. */
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+ dlm->key, nodenum);
else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
if (dlm_is_host_down(ret)) {
/* node is down. not involved in recovery
* so just keep going */
- mlog(0, "%s: node %u was down when sending "
+ mlog(ML_NOTICE, "%s: node %u was down when sending "
"begin reco msg (%d)\n", dlm->name, nodenum, ret);
ret = 0;
}
@@ -2660,11 +2666,12 @@ retry:
}
if (ret < 0) {
struct dlm_lock_resource *res;
+
/* this is now a serious problem, possibly ENOMEM
* in the network stack. must retry */
mlog_errno(ret);
mlog(ML_ERROR, "begin reco of dlm %s to node %u "
- " returned %d\n", dlm->name, nodenum, ret);
+ "returned %d\n", dlm->name, nodenum, ret);
res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
DLM_RECOVERY_LOCK_NAME_LEN);
if (res) {
@@ -2789,7 +2796,9 @@ stage2:
if (ret >= 0)
ret = status;
if (ret < 0) {
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+ dlm->key, nodenum);
if (dlm_is_host_down(ret)) {
/* this has no effect on this recovery
* session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1f..d4f73ca 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
* basts right before queueing them all throughout */
+ assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
@@ -337,7 +338,7 @@ converting:
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +356,7 @@ converting:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
@@ -383,7 +384,7 @@ converting:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -402,7 +403,7 @@ blocked:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +419,7 @@ blocked:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +445,7 @@ blocked:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -674,6 +675,7 @@ static int dlm_thread(void *data)
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
+ spin_lock(&dlm->ast_lock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
@@ -694,6 +696,7 @@ static int dlm_thread(void *data)
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->ast_lock);
mlog(0, "delaying list shuffling for in-"
"progress lockres %.*s, state=%d\n",
res->lockname.len, res->lockname.name,
@@ -715,6 +718,7 @@ static int dlm_thread(void *data)
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->ast_lock);
dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b47c1b9..817287c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5fbd9c..f74f140 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
inode->i_atime = CURRENT_TIME;
di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, fe_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
int status = 0;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_truncate_context *tc = NULL;
mlog_entry("(inode = %llu, new_i_size = %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_resv_discard(&osb->osb_la_resmap,
+ &OCFS2_I(inode)->ip_la_data_resv);
+
/*
* The inode lock forced other nodes to sync and drop their
* pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
- status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail_unlock_sem;
- }
-
- status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
goto leave;
}
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -1195,9 +1182,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1419,90 @@ out:
return ret;
}
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+ int i;
+ struct ocfs2_extent_rec *rec = NULL;
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) < pos)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec,
+ u32 trunc_start, u32 *trunc_cpos,
+ u32 *trunc_len, u32 *trunc_end,
+ u64 *blkno, int *done)
+{
+ int ret = 0;
+ u32 coff, range;
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+ *trunc_cpos = le32_to_cpu(rec->e_cpos);
+ /*
+ * Skip holes if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno);
+ *trunc_end = le32_to_cpu(rec->e_cpos);
+ } else if (range > trunc_start) {
+ *trunc_cpos = trunc_start;
+ *trunc_len = *trunc_end - trunc_start;
+ coff = trunc_start - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
+ *trunc_end = trunc_start;
+ } else {
+ /*
+ * It may have two following possibilities:
+ *
+ * - last record has been removed
+ * - trunc_start was within a hole
+ *
+ * both two cases mean the completion of hole punching.
+ */
+ ret = 1;
+ }
+
+ *done = ret;
+}
+
static int ocfs2_remove_inode_range(struct inode *inode,
struct buffer_head *di_bh, u64 byte_start,
u64 byte_len)
{
- int ret = 0;
- u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+ int ret = 0, flags = 0, done = 0, i;
+ u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+ u32 cluster_in_el;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
struct address_space *mapping = inode->i_mapping;
struct ocfs2_extent_tree et;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el = NULL;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1528,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
+ /*
+ * For reflinks, we may need to CoW 2 clusters which might be
+ * partially zero'd later, if hole's start and end offset were
+ * within one cluster(means is not exactly aligned to clustersize).
+ */
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
- trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
- if (trunc_len >= trunc_start)
- trunc_len -= trunc_start;
- else
- trunc_len = 0;
+ trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+ cluster_in_el = trunc_end;
- mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+ mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)byte_start,
- (unsigned long long)byte_len, trunc_start, trunc_len);
+ (unsigned long long)byte_len, trunc_start, trunc_end);
ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
if (ret) {
@@ -1487,31 +1564,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- cpos = trunc_start;
- while (trunc_len) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
- &alloc_size, NULL);
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (trunc_end > trunc_start) {
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path,
+ cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
- if (alloc_size > trunc_len)
- alloc_size = trunc_len;
+ el = path_leaf_el(path);
- /* Only do work for non-holes */
- if (phys_cpos != 0) {
- ret = ocfs2_remove_btree_range(inode, &et, cpos,
- phys_cpos, alloc_size,
- &dealloc);
+ i = ocfs2_find_rec(el, trunc_end);
+ /*
+ * Need to go to previous extent block.
+ */
+ if (i < 0) {
+ if (path->p_tree_depth == 0)
+ break;
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ path,
+ &cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
+
+ /*
+ * We've reached the leftmost extent block,
+ * it's safe to leave.
+ */
+ if (cluster_in_el == 0)
+ break;
+
+ /*
+ * The 'pos' searched for previous extent block is
+ * always one cluster less than actual trunc_end.
+ */
+ trunc_end = cluster_in_el + 1;
+
+ ocfs2_reinit_path(path, 1);
+
+ continue;
+
+ } else
+ rec = &el->l_recs[i];
+
+ ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+ &trunc_len, &trunc_end, &blkno, &done);
+ if (done)
+ break;
+
+ flags = rec->e_flags;
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags,
+ &dealloc, refcount_loc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
- cpos += alloc_size;
- trunc_len -= alloc_size;
+ cluster_in_el = trunc_end;
+
+ ocfs2_reinit_path(path, 1);
}
ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index af18988..abb0a95 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_last_used_slot = 0;
OCFS2_I(inode)->ip_last_used_group = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+ OCFS2_RESV_FLAG_DIR);
mlog_exit_void();
}
@@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
- struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
handle_t *handle = NULL;
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle);
handle = NULL;
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
-
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
- status = ocfs2_journal_dirty(handle, di_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ ocfs2_journal_dirty(handle, di_bh);
ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
dquot_free_inode(inode);
@@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
void ocfs2_delete_inode(struct inode *inode)
{
int wipe, status;
- sigset_t blocked, oldset;
+ sigset_t oldset;
struct buffer_head *di_bh = NULL;
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
* forever. */
- sigfillset(&blocked);
- status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_cleanup_delete_inode(inode, 1);
- goto bail;
- }
+ ocfs2_block_signals(&oldset);
/*
* Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync:
ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
bail_unblock:
- status = sigprocmask(SIG_SETMASK, &oldset, NULL);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_unblock_signals(&oldset);
bail:
clear_inode(inode);
mlog_exit_void();
@@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+ ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ &oi->ip_la_data_resv);
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
* locks until the journal has finished with it. The only
@@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
-
- status = 0;
+ ocfs2_journal_dirty(handle, bh);
leave:
-
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0b28e19..9f5f5fc 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
+
+ struct ocfs2_alloc_reservation ip_la_data_resv;
};
/*
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60..47878cf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
}
/*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
*
* This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
*/
int ocfs2_extend_trans(handle_t *handle, int nblocks)
{
- int status;
+ int status, old_nblocks;
BUG_ON(!handle);
- BUG_ON(!nblocks);
+ BUG_ON(nblocks < 0);
+
+ if (!nblocks)
+ return 0;
+ old_nblocks = handle->h_buffer_credits;
mlog_entry_void();
mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
mlog(0,
"jbd2_journal_extend failed, trying "
"jbd2_journal_restart\n");
- status = jbd2_journal_restart(handle, nblocks);
+ status = jbd2_journal_restart(handle,
+ old_nblocks + nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
return __ocfs2_journal_access(handle, ci, bh, NULL, type);
}
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
int status;
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
(unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- if (status < 0)
- mlog(ML_ERROR, "Could not dirty metadata buffer. "
- "(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
+ BUG_ON(status);
- mlog_exit(status);
- return status;
+ mlog_exit_void();
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09..b5baaa8 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
/*
* Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
return blocks;
}
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+ return ocfs2_extent_recs_per_gd(sb);
+}
+
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c983715..3d74196 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
- u32 numbits);
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv);
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ * group descriptors for other allocations (such as block groups,
+ * etc). Picking default sizes which are a multiple of 4 could help
+ * - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ * file systems. This can easily be taken care of by limiting our
+ * default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ * particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K group: 126M la: 121M
+ * csize: 8K group: 252M la: 243M
+ * csize: 16K group: 504M la: 486M
+ * csize: 32K group: 1008M la: 972M
+ * csize: 64K group: 2016M la: 1944M
+ * csize: 128K group: 4032M la: 3888M
+ * csize: 256K group: 8064M la: 7776M
+ * csize: 512K group: 16128M la: 15552M
+ * csize: 1024K group: 32256M la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB 256
+#define OCFS2_LA_OLD_DEFAULT 8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+ unsigned int la_mb;
+ unsigned int gd_mb;
+ unsigned int megs_per_slot;
+ struct super_block *sb = osb->sb;
+
+ gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+ /*
+ * This takes care of files systems with very small group
+ * descriptors - 512 byte blocksize at cluster sizes lower
+ * than 16K and also 1k blocksize with 4k cluster size.
+ */
+ if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+ || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+ return OCFS2_LA_OLD_DEFAULT;
+
+ /*
+ * Leave enough room for some block groups and make the final
+ * value we work from a multiple of 4.
+ */
+ gd_mb -= 16;
+ gd_mb &= 0xFFFFFFFB;
+
+ la_mb = gd_mb;
+
+ /*
+ * Keep window sizes down to a reasonable default
+ */
+ if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+ /*
+ * Some clustersize / blocksize combinations will have
+ * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+ * default size, but get poor distribution when
+ * limited to exactly 256 megabytes.
+ *
+ * As an example, 16K clustersize at 4K blocksize
+ * gives us a cluster group size of 504M. Paring the
+ * local alloc size down to 256 however, would give us
+ * only one window and around 200MB left in the
+ * cluster group. Instead, find the first size below
+ * 256 which would give us an even distribution.
+ *
+ * Larger cluster group sizes actually work out pretty
+ * well when pared to 256, so we don't have to do this
+ * for any group that fits more than two
+ * OCFS2_LA_MAX_DEFAULT_MB windows.
+ */
+ if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+ la_mb = 256;
+ else {
+ unsigned int gd_mult = gd_mb;
+
+ while (gd_mult > 256)
+ gd_mult = gd_mult >> 1;
+
+ la_mb = gd_mult;
+ }
+ }
+
+ megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+ megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+ /* Too many nodes, too few disk clusters. */
+ if (megs_per_slot < la_mb)
+ la_mb = megs_per_slot;
+
+ return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+ struct super_block *sb = osb->sb;
+ unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+ unsigned int la_max_mb;
+
+ la_max_mb = ocfs2_clusters_to_megabytes(sb,
+ ocfs2_local_alloc_size(sb) * 8);
+
+ mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+ requested_mb, la_max_mb, la_default_mb);
+
+ if (requested_mb == -1) {
+ /* No user request - use defaults */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_default_mb);
+ } else if (requested_mb > la_max_mb) {
+ /* Request is too big, we give the maximum available */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_max_mb);
+ } else {
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, requested_mb);
+ }
+
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits =
ocfs2_megabytes_to_clusters(osb->sb,
- OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+ ocfs2_la_default_mb(osb));
}
/* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_state = OCFS2_LA_DISABLED;
+ ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
return status;
}
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
- struct ocfs2_alloc_context *ac,
- u32 bits_wanted)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *alloc;
- struct ocfs2_local_alloc *la;
- int start;
- u64 block_off;
-
- if (!ac->ac_max_block)
- return 1;
-
- alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
- la = OCFS2_LOCAL_ALLOC(alloc);
-
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
- if (start == -1) {
- mlog_errno(-ENOSPC);
- return 0;
- }
-
- /*
- * Converting (bm_off + start + bits_wanted) to blocks gives us
- * the blkno just past our actual allocation. This is perfect
- * to compare with ac_max_block.
- */
- block_off = ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(la->la_bm_off) +
- start + bits_wanted);
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)block_off,
- (unsigned long long)ac->ac_max_block);
- if (block_off > ac->ac_max_block)
- return 0;
-
- return 1;
-}
-
/*
* make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog(0, "Calling in_range for max block %llu\n",
(unsigned long long)ac->ac_max_block);
- if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
- bits_wanted)) {
- /*
- * The window is outside ac->ac_max_block.
- * This errno tells the caller to keep localalloc enabled
- * but to get the allocation from the main bitmap.
- */
- status = -EFBIG;
- goto bail;
- }
-
ac->ac_inode = local_alloc_inode;
/* We should never use localalloc from another slot */
ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+ ac->ac_resv);
if (start == -1) {
/* TODO: Shouldn't we just BUG here? */
status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
bitmap = la->la_bitmap;
*bit_off = le32_to_cpu(la->la_bm_off) + start;
- /* local alloc is always contiguous by nature -- we never
- * delete bits from it! */
*num_bits = bits_wanted;
status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
+ ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+ bits_wanted);
+
while(bits_wanted--)
ocfs2_set_bit(start++, bitmap);
le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = 0;
bail:
mlog_exit(status);
return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
}
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
- struct ocfs2_dinode *alloc,
- u32 numbits)
+ struct ocfs2_dinode *alloc,
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv)
{
int numfound, bitoff, left, startoff, lastzero;
+ int local_resv = 0;
+ struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
+ struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
- mlog_entry("(numbits wanted = %u)\n", numbits);
+ mlog_entry("(numbits wanted = %u)\n", *numbits);
if (!alloc->id1.bitmap1.i_total) {
mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
goto bail;
}
+ if (!resv) {
+ local_resv = 1;
+ ocfs2_resv_init_once(&r);
+ ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+ resv = &r;
+ }
+
+ numfound = *numbits;
+ if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+ if (numfound < *numbits)
+ *numbits = numfound;
+ goto bail;
+ }
+
+ /*
+ * Code error. While reservations are enabled, local
+ * allocation should _always_ go through them.
+ */
+ BUG_ON(osb->osb_resv_level != 0);
+
+ /*
+ * Reservations are disabled. Handle this the old way.
+ */
+
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
startoff = bitoff+1;
}
/* we got everything we needed */
- if (numfound == numbits) {
+ if (numfound == *numbits) {
/* mlog(0, "Found it all!\n"); */
break;
}
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
numfound);
- if (numfound == numbits)
+ if (numfound == *numbits)
bitoff = startoff - numfound;
else
bitoff = -1;
bail:
+ if (local_resv)
+ ocfs2_resv_discard(resmap, resv);
+
mlog_exit(bitoff);
return bitoff;
}
@@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+ status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
if (status == -ENOSPC) {
retry_enospc:
@@ -1063,7 +1175,7 @@ retry_enospc:
goto bail;
ac->ac_bits_wanted = osb->local_alloc_default_bits;
- status = ocfs2_claim_clusters(osb, handle, ac,
+ status = ocfs2_claim_clusters(handle, ac,
osb->local_alloc_bits,
&cluster_off,
&cluster_count);
@@ -1098,6 +1210,9 @@ retry_enospc:
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
+ ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+ OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
mlog(0, "New window allocated:\n");
mlog(0, "window la_bm_off = %u\n",
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
@@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
atomic_inc(&osb->alloc_stats.moves);
- status = 0;
bail:
if (handle)
ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f8..1be9b58 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7898bd3..af2b8fe 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -41,44 +41,20 @@
#include "file.h"
#include "inode.h"
#include "mmap.h"
+#include "super.h"
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
- /* The best way to deal with signals in the vm path is
- * to block them upfront, rather than allowing the
- * locking paths to return -ERESTARTSYS. */
- sigfillset(blocked);
-
- /* We should technically never get a bad return value
- * from sigprocmask */
- return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
- return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
- sigset_t blocked, oldset;
- int error, ret;
+ sigset_t oldset;
+ int ret;
mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
- error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (error < 0) {
- mlog_errno(error);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
+ ocfs2_block_signals(&oldset);
ret = filemap_fault(area, vmf);
+ ocfs2_unblock_signals(&oldset);
- error = ocfs2_vm_op_unblock_sigs(&oldset);
- if (error < 0)
- mlog_errno(error);
-out:
mlog_exit_ptr(vmf->page);
return ret;
}
@@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
struct buffer_head *di_bh = NULL;
- sigset_t blocked, oldset;
- int ret, ret2;
+ sigset_t oldset;
+ int ret;
- ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
+ ocfs2_block_signals(&oldset);
/*
* The cluster locks taken will block a truncate from another
@@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ocfs2_inode_unlock(inode, 1);
out:
- ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
- if (ret2 < 0)
- mlog_errno(ret2);
+ ocfs2_unblock_signals(&oldset);
if (ret)
ret = VM_FAULT_SIGBUS;
return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4cbb18f..db5dd3e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -239,6 +239,8 @@ static int ocfs2_mknod(struct inode *dir,
};
int did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
(unsigned long)dev, dentry->d_name.len,
@@ -350,6 +352,10 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
status = dquot_alloc_inode(inode);
if (status)
goto leave;
@@ -384,11 +390,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
ocfs2_add_links_count(dirfe, 1);
- status = ocfs2_journal_dirty(handle, parent_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, parent_fe_bh);
inc_nlink(dir);
}
@@ -439,6 +441,8 @@ leave:
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
@@ -487,14 +491,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
int status = 0;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 fe_blkno = 0;
+ u64 suballoc_loc, fe_blkno = 0;
u16 suballoc_bit;
u16 feat;
*new_fe_bh = NULL;
- status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
- inode_ac, &suballoc_bit, &fe_blkno);
+ status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+ inode_ac, &suballoc_loc,
+ &suballoc_bit, &fe_blkno);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -531,6 +536,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_generation = cpu_to_le32(inode->i_generation);
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = cpu_to_le64(fe_blkno);
+ fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +573,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
}
- status = ocfs2_journal_dirty(handle, *new_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, *new_fe_bh);
ocfs2_populate_inode(inode, fe, 1);
ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -637,6 +639,7 @@ static int ocfs2_link(struct dentry *old_dentry,
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +696,9 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+
err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
@@ -705,14 +711,7 @@ static int ocfs2_link(struct dentry *old_dentry,
ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
- err = ocfs2_journal_dirty(handle, fe_bh);
- if (err < 0) {
- ocfs2_add_links_count(fe, -1);
- drop_nlink(inode);
- mlog_errno(err);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno,
@@ -736,6 +735,7 @@ static int ocfs2_link(struct dentry *old_dentry,
out_commit:
ocfs2_commit_trans(osb, handle);
+ ocfs2_unblock_signals(&oldset);
out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
@@ -909,12 +909,7 @@ static int ocfs2_unlink(struct inode *dir,
drop_nlink(inode);
drop_nlink(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1327,7 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_set_links_count(newfe, 0);
else
ocfs2_add_links_count(newfe, -1);
-
- status = ocfs2_journal_dirty(handle, newfe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, newfe_bh);
} else {
/* if the name was not found in new_dir, add it now */
status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1346,7 @@ static int ocfs2_rename(struct inode *old_dir,
old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
-
- status = ocfs2_journal_dirty(handle, old_inode_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1431,7 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
- status = ocfs2_journal_dirty(handle, old_dir_bh);
+ ocfs2_journal_dirty(handle, old_dir_bh);
}
}
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1550,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
(bytes_left > sb->s_blocksize) ? sb->s_blocksize :
bytes_left);
- status = ocfs2_journal_dirty(handle, bhs[virtual]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[virtual]);
virtual++;
p_blkno++;
@@ -1611,6 +1594,8 @@ static int ocfs2_symlink(struct inode *dir,
};
int did_quota = 0, did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1691,10 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
status = dquot_alloc_inode(inode);
if (status)
goto bail;
@@ -1814,6 +1803,8 @@ bail:
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
brelse(new_fe_bh);
brelse(parent_fe_bh);
@@ -1961,12 +1952,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, 1);
orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2051,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, -1);
orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
leave:
ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index adf5e2e..c67003b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
/* For struct ocfs2_blockcheck_stats */
#include "blockcheck.h"
+#include "reservations.h"
/* Caching of metadata buffers */
@@ -341,6 +342,9 @@ struct ocfs2_super
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
+ /* osb_clusters_at_boot can become stale! Do not trust it to
+ * be up to date. */
+ unsigned int osb_clusters_at_boot;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
u64 la_last_gd;
+ struct ocfs2_reservation_map osb_la_resmap;
+
+ unsigned int osb_resv_level;
+ unsigned int osb_dir_resv_level;
+
/* Next three fields are for local node slot recovery during
* mount. */
int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ return 1;
+ return 0;
+}
+
static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
{
if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+ unsigned int clusters)
+{
+ return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
ext2_set_bit(bit, bitmap);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218..33f1c9a 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
| OCFS2_FEATURE_INCOMPAT_XATTR \
| OCFS2_FEATURE_INCOMPAT_META_ECC \
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
- | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
/* Refcount tree support */
#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
+/* Discontigous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -283,14 +287,6 @@
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
/*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
-
-/*
* Inline extended attribute size (in bytes)
* The value chosen should be aligned to 16 byte boundaries.
*/
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
block group */
__le32 h_fs_generation; /* Must match super block */
__le64 h_blkno; /* Offset on disk, in blocks */
-/*20*/ __le64 h_reserved3;
+/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
+ eb belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
of next leaf header pointing
to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
/*80*/ struct ocfs2_block_check i_check; /* Error checking */
/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
/*90*/ __le64 i_refcount_loc;
- __le64 i_reserved2[4];
+ __le64 i_suballoc_loc; /* Suballocator block group this
+ inode belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
+/*A0*/ __le64 i_reserved2[3];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
__le32 dr_reserved2;
__le64 dr_free_blk; /* Pointer to head of free
* unindexed block list. */
- __le64 dr_reserved3[15];
+ __le64 dr_suballoc_loc; /* Suballocator block group
+ this root belongs to.
+ Only valid if allocated
+ from a discontiguous
+ block group */
+ __le64 dr_reserved3[14];
union {
struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
* bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
};
/*
+ * Largest bitmap for a block (suballocator) group in bytes. This limit
+ * does not affect cluster groups (global allocator). Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE 256
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
__le64 bg_blkno; /* Offset on disk, in blocks */
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
-/*40*/ __u8 bg_bitmap[0];
+/*40*/ union {
+ __u8 bg_bitmap[0];
+ struct {
+ /*
+ * Block groups may be discontiguous when
+ * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+ * The extents of a discontigous block group are
+ * stored in bg_list. It is a flat list.
+ * l_tree_depth must always be zero. A
+ * discontiguous group is signified by a non-zero
+ * bg_list->l_next_free_rec. Only block groups
+ * can be discontiguous; Cluster groups cannot.
+ * We've never made a block group with more than
+ * 2048 blocks (256 bytes of bg_bitmap). This
+ * codifies that limit so that we can fit bg_list.
+ * bg_size of a discontiguous block group will
+ * be 256 to match bg_bitmap_filler.
+ */
+ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/ struct ocfs2_extent_list bg_list;
+ };
+ };
+/* Actual on-disk size is one block */
};
struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
/*40*/ __le32 rf_generation; /* generation number. all be the same
* for the same refcount tree. */
__le32 rf_reserved0;
- __le64 rf_reserved1[7];
+ __le64 rf_suballoc_loc; /* Suballocator block group this
+ refcount block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
+/*50*/ __le64 rf_reserved1[6];
/*80*/ union {
struct ocfs2_refcount_list rf_records; /* List of refcount
records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
real xattr or a xattr tree. */
__le16 xb_reserved0;
__le32 xb_reserved1;
- __le64 xb_reserved2;
+ __le64 xb_suballoc_loc; /* Suballocator block group this
+ xattr block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
/*30*/ union {
struct ocfs2_xattr_header xb_header; /* xattr header if this
block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
{
int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
return size;
}
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+ int suballocator,
+ u32 feature_incompat)
{
- int size;
-
- size = sb->s_blocksize -
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_local_alloc_size(int blocksize)
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
{
int size;
size = blocksize -
- offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
- return size;
+ return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_group_bitmap_size(int blocksize)
+static inline int ocfs2_local_alloc_size(int blocksize)
{
int size;
size = blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+ return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+ int suballocator,
+ uint32_t feature_incompat)
+{
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+ if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+ le16_to_cpu(gd->bg_size)) !=
+ offsetof(struct ocfs2_group_desc, bg_list))
+ return 0;
+ /*
+ * Only valid to check l_next_free_rec if
+ * bg_bitmap + bg_size == bg_list.
+ */
+ if (!gd->bg_list.l_next_free_rec)
+ return 0;
+ return 1;
+}
#endif /* _OCFS2_FS_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab42a74..04ae76d 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -261,10 +261,8 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
brelse(bh);
goto out;
}
- err = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
- if (err < 0)
- goto out;
out:
if (err) {
mutex_unlock(&gqinode->i_mutex);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9ad4930..884b641 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -119,12 +119,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
lock_buffer(bh);
modify(bh, private);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_commit_trans(OCFS2_SB(sb), handle);
- return status;
- }
+ ocfs2_journal_dirty(handle, bh);
+
status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
if (status < 0) {
mlog_errno(status);
@@ -523,9 +519,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
- status = ocfs2_journal_dirty(handle, qbh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, qbh);
out_commit:
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +625,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
lock_buffer(bh);
ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
out_bh:
@@ -1009,11 +1001,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
OCFS2_QBLK_RESERVED_SPACE);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, bh);
/* Initialize new block with structures */
down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
@@ -1040,11 +1028,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
lock_buffer(dbh);
memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
unlock_buffer(dbh);
- status = ocfs2_journal_dirty(handle, dbh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, dbh);
/* Update local quotafile info */
oinfo->dqi_blocks += 2;
@@ -1155,11 +1139,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
lock_buffer(bh);
memset(bh->b_data, 0, sb->s_blocksize);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, bh);
+
/* Update chunk header */
status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
chunk->qc_headerbh,
@@ -1173,11 +1154,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
lock_buffer(chunk->qc_headerbh);
le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
unlock_buffer(chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
/* Update file header */
oinfo->dqi_blocks++;
status = ocfs2_local_write_info(sb, type);
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
- status = 0;
+ ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
out:
/* Clear the read bit so that next time someone uses this
* dquot he reads fresh info from disk and allocates local
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5cbcd0f..4793f36 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
goto out_commit;
}
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&first_blkno);
if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
memset(rb, 0, inode->i_sb->s_blocksize);
strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
if (le32_to_cpu(rb->rf_count) == 1) {
blk = le64_to_cpu(rb->rf_blkno);
bit = le16_to_cpu(rb->rf_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (rb->rf_suballoc_loc)
+ bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
} else if (merge)
ocfs2_refcount_rec_merge(rb, index);
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
return ret;
}
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
int ret;
u16 suballoc_bit_start;
u32 num_got;
- u64 blkno;
+ u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct buffer_head *new_bh = NULL;
struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
goto out;
}
- ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_blkno = cpu_to_le64(blkno);
new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
int ret;
u16 suballoc_bit_start;
u32 num_got, new_cpos;
- u64 blkno;
+ u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *root_rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
goto out;
}
- ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
memset(new_rb, 0, sb->s_blocksize);
strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
* 2 more credits, one for the leaf refcount block, one for
* the extent block contains the extent rec.
*/
- ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+ ret = ocfs2_extend_trans(handle, 2);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
if (merge)
ocfs2_refcount_rec_merge(rb, index);
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
if (index == 0) {
ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
ocfs2_refcount_rec_merge(rb, index);
}
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
*/
ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(rb->rf_suballoc_slot),
+ le64_to_cpu(rb->rf_suballoc_loc),
le64_to_cpu(rb->rf_blkno),
le16_to_cpu(rb->rf_suballoc_bit));
if (ret) {
@@ -2516,20 +2515,19 @@ out:
*
* Normally the refcount blocks store these refcount should be
* contiguous also, so that we can get the number easily.
- * As for meta_ac, we will at most add split 2 refcount record and
- * 2 more refcount block, so just check it in a rough way.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
*
* Caller must hold refcount tree lock.
*/
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
- struct buffer_head *di_bh,
+ u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
- struct ocfs2_alloc_context **meta_ac)
+ int *ref_blocks)
{
- int ret, ref_blocks = 0;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_refcount_loc), &tree);
+ refcount_loc, &tree);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_read_refcount_block(&tree->rf_ci,
- le64_to_cpu(di->i_refcount_loc),
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
&ref_root_bh);
if (ret) {
mlog_errno(ret);
@@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
&tree->rf_ci,
ref_root_bh,
start_cpos, clusters,
- &ref_blocks, credits);
+ ref_blocks, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "reserve new metadata %d, credits = %d\n",
- ref_blocks, *credits);
-
- if (ref_blocks) {
- ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
- ref_blocks, meta_ac);
- if (ret)
- mlog_errno(ret);
- }
+ mlog(0, "reserve new metadata %d blocks, credits = %d\n",
+ *ref_blocks, *credits);
out:
brelse(ref_root_bh);
@@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
}
memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
- ret = ocfs2_journal_dirty(handle, new_bh);
- if (ret) {
- mlog_errno(ret);
- break;
- }
+ ocfs2_journal_dirty(handle, new_bh);
brelse(new_bh);
brelse(old_bh);
@@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
} else {
delete = 1;
- ret = __ocfs2_claim_clusters(osb, handle,
+ ret = __ocfs2_claim_clusters(handle,
context->data_ac,
1, set_len,
&new_bit, &new_len);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1..9983ba1 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int delete);
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
- struct buffer_head *di_bh,
+ u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
- struct ocfs2_alloc_context **meta_ac);
+ int *ref_blocks);
int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 0000000..4065002
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#define MLOG_MASK_PREFIX ML_RESERVATIONS
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define OCFS2_MIN_RESV_WINDOW_BITS 8
+#define OCFS2_MAX_RESV_WINDOW_BITS 1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+ return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ unsigned int bits;
+
+ if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+ /* 8, 16, 32, 64, 128, 256, 512, 1024 */
+ bits = 4 << osb->osb_resv_level;
+ } else {
+ bits = 4 << osb->osb_dir_resv_level;
+ }
+ return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_len)
+ return resv->r_start + resv->r_len - 1;
+ return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+ return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+ if (resmap->m_osb->osb_resv_level == 0)
+ return 1;
+ return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+ int i = 0;
+
+ mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+ osb->dev_str, resmap->m_bitmap_len);
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+ "\tlast_len: %u\n", resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ node = rb_next(node);
+ i++;
+ }
+
+ mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+ i = 0;
+ list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+ mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+ "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ i++;
+ }
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+ int i,
+ struct ocfs2_alloc_reservation *resv)
+{
+ char *disk_bitmap = resmap->m_disk_bitmap;
+ unsigned int start = resv->r_start;
+ unsigned int end = ocfs2_resv_end(resv);
+
+ while (start <= end) {
+ if (ocfs2_test_bit(start, disk_bitmap)) {
+ mlog(ML_ERROR,
+ "reservation %d covers an allocated area "
+ "starting at bit %u!\n", i, start);
+ return 1;
+ }
+
+ start++;
+ }
+ return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+ unsigned int off = 0;
+ int i = 0;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (i > 0 && resv->r_start <= off) {
+ mlog(ML_ERROR, "reservation %d has bad start off!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_len == 0) {
+ mlog(ML_ERROR, "reservation %d has no length!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_start > ocfs2_resv_end(resv)) {
+ mlog(ML_ERROR, "reservation %d has invalid range!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+ mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_validate_resmap_bits(resmap, i, resv))
+ goto bad;
+
+ off = ocfs2_resv_end(resv);
+ node = rb_next(node);
+
+ i++;
+ }
+ return;
+
+bad:
+ ocfs2_dump_resv(resmap);
+ BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+ memset(resv, 0, sizeof(*resv));
+ INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags)
+{
+ BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+ resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap)
+{
+ memset(resmap, 0, sizeof(*resmap));
+
+ resmap->m_osb = osb;
+ resmap->m_reservations = RB_ROOT;
+ /* m_bitmap_len is initialized to zero by the above memset. */
+ INIT_LIST_HEAD(&resmap->m_lru);
+
+ return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ if (!list_empty(&resv->r_lru))
+ list_del_init(&resv->r_lru);
+
+ list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+ resv->r_len = 0;
+ resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+ list_del_init(&resv->r_lru);
+ rb_erase(&resv->r_node, &resmap->m_reservations);
+ resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+ }
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ __ocfs2_resv_trunc(resv);
+ /*
+ * last_len and last_start no longer make sense if
+ * we're changing the range of our allocations.
+ */
+ resv->r_last_len = resv->r_last_start = 0;
+
+ ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv) {
+ spin_lock(&resv_lock);
+ __ocfs2_resv_discard(resmap, resv);
+ spin_unlock(&resv_lock);
+ }
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ assert_spin_locked(&resv_lock);
+
+ while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ __ocfs2_resv_discard(resmap, resv);
+ }
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap)
+{
+ if (ocfs2_resmap_disabled(resmap))
+ return;
+
+ spin_lock(&resv_lock);
+
+ ocfs2_resmap_clear_all_resv(resmap);
+ resmap->m_bitmap_len = clen;
+ resmap->m_disk_bitmap = disk_bitmap;
+
+ spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+ /* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *new)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &root->rb_node;
+ struct ocfs2_alloc_reservation *tmp;
+
+ assert_spin_locked(&resv_lock);
+
+ mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
+ new->r_len);
+
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+ if (new->r_start < tmp->r_start) {
+ p = &(*p)->rb_left;
+
+ /*
+ * This is a good place to check for
+ * overlapping reservations.
+ */
+ BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+ } else if (new->r_start > ocfs2_resv_end(tmp)) {
+ p = &(*p)->rb_right;
+ } else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate reservation window!\n");
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, root);
+ new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+ ocfs2_resv_mark_lru(resmap, new);
+
+ ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+ struct ocfs2_alloc_reservation *resv = NULL;
+ struct ocfs2_alloc_reservation *prev_resv = NULL;
+ struct rb_node *node = resmap->m_reservations.rb_node;
+
+ assert_spin_locked(&resv_lock);
+
+ if (!node)
+ return NULL;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+ break;
+
+ /* Check if we overshot the reservation just before goal? */
+ if (resv->r_start > goal) {
+ resv = prev_resv;
+ break;
+ }
+
+ prev_resv = resv;
+ node = rb_next(node);
+ }
+
+ return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * *cstart and *clen will also be populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+ unsigned int wanted,
+ unsigned int search_start,
+ unsigned int search_len,
+ unsigned int *rstart,
+ unsigned int *rlen)
+{
+ void *bitmap = resmap->m_disk_bitmap;
+ unsigned int best_start, best_len = 0;
+ int offset, start, found;
+
+ mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
+ wanted, search_start, search_len, resmap->m_bitmap_len);
+
+ found = best_start = best_len = 0;
+
+ start = search_start;
+ while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+ start)) != -1) {
+ /* Search reached end of the region */
+ if (offset >= (search_start + search_len))
+ break;
+
+ if (offset == start) {
+ /* we found a zero */
+ found++;
+ /* move start to the next bit to test */
+ start++;
+ } else {
+ /* got a zero after some ones */
+ found = 1;
+ start = offset + 1;
+ }
+ if (found > best_len) {
+ best_len = found;
+ best_start = start - found;
+ }
+
+ if (found >= wanted)
+ break;
+ }
+
+ if (best_len == 0)
+ return 0;
+
+ if (best_len >= wanted)
+ best_len = wanted;
+
+ *rlen = best_len;
+ *rstart = best_start;
+
+ mlog(0, "Found start: %u len: %u\n", best_start, best_len);
+
+ return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int goal, unsigned int wanted)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ unsigned int gap_start, gap_end, gap_len;
+ struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+ struct rb_node *prev, *next;
+ unsigned int cstart, clen;
+ unsigned int best_start = 0, best_len = 0;
+
+ /*
+ * Nasty cases to consider:
+ *
+ * - rbtree is empty
+ * - our window should be first in all reservations
+ * - our window should be last in all reservations
+ * - need to make sure we don't go past end of bitmap
+ */
+
+ mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
+ resv->r_start, ocfs2_resv_end(resv), goal, wanted);
+
+ assert_spin_locked(&resv_lock);
+
+ if (RB_EMPTY_ROOT(root)) {
+ /*
+ * Easiest case - empty tree. We can just take
+ * whatever window of free bits we want.
+ */
+
+ mlog(0, "Empty root\n");
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ resmap->m_bitmap_len - goal,
+ &cstart, &clen);
+
+ /*
+ * This should never happen - the local alloc window
+ * will always have free bits when we're called.
+ */
+ BUG_ON(goal == 0 && clen == 0);
+
+ if (clen == 0)
+ return;
+
+ resv->r_start = cstart;
+ resv->r_len = clen;
+
+ ocfs2_resv_insert(resmap, resv);
+ return;
+ }
+
+ prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+ if (prev_resv == NULL) {
+ mlog(0, "Goal on LHS of leftmost window\n");
+
+ /*
+ * A NULL here means that the search code couldn't
+ * find a window that starts before goal.
+ *
+ * However, we can take the first window after goal,
+ * which is also by definition, the leftmost window in
+ * the entire tree. If we can find free bits in the
+ * gap between goal and the LHS window, then the
+ * reservation can safely be placed there.
+ *
+ * Otherwise we fall back to a linear search, checking
+ * the gaps in between windows for a place to
+ * allocate.
+ */
+
+ next = rb_first(root);
+ next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+ r_node);
+
+ /*
+ * The search should never return such a window. (see
+ * comment above
+ */
+ if (next_resv->r_start <= goal) {
+ mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+ goal, next_resv->r_start, next_resv->r_len);
+ ocfs2_dump_resv(resmap);
+ BUG();
+ }
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ next_resv->r_start - goal,
+ &cstart, &clen);
+ if (clen) {
+ best_len = clen;
+ best_start = cstart;
+ if (best_len == wanted)
+ goto out_insert;
+ }
+
+ prev_resv = next_resv;
+ next_resv = NULL;
+ }
+
+ prev = &prev_resv->r_node;
+
+ /* Now we do a linear search for a window, starting at 'prev_rsv' */
+ while (1) {
+ next = rb_next(prev);
+ if (next) {
+ mlog(0, "One more resv found in linear search\n");
+ next_resv = rb_entry(next,
+ struct ocfs2_alloc_reservation,
+ r_node);
+
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_end = next_resv->r_start - 1;
+ gap_len = gap_end - gap_start + 1;
+ } else {
+ mlog(0, "No next node\n");
+ /*
+ * We're at the rightmost edge of the
+ * tree. See if a reservation between this
+ * window and the end of the bitmap will work.
+ */
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_len = resmap->m_bitmap_len - gap_start;
+ gap_end = resmap->m_bitmap_len - 1;
+ }
+
+ /*
+ * No need to check this gap if we have already found
+ * a larger region of free bits.
+ */
+ if (gap_len <= best_len)
+ goto next_resv;
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+ gap_len, &cstart, &clen);
+ if (clen == wanted) {
+ best_len = clen;
+ best_start = cstart;
+ goto out_insert;
+ } else if (clen > best_len) {
+ best_len = clen;
+ best_start = cstart;
+ }
+
+next_resv:
+ if (!next)
+ break;
+
+ prev = next;
+ prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+ r_node);
+ }
+
+out_insert:
+ if (best_len) {
+ resv->r_start = best_start;
+ resv->r_len = best_len;
+ ocfs2_resv_insert(resmap, resv);
+ }
+}
+
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ struct ocfs2_alloc_reservation *lru_resv;
+ int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+ unsigned int min_bits;
+
+ if (!tmpwindow)
+ min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+ else
+ min_bits = wanted; /* We at know the temp window will use all
+ * of these bits */
+
+ /*
+ * Take the first reservation off the LRU as our 'target'. We
+ * don't try to be smart about it. There might be a case for
+ * searching based on size but I don't have enough data to be
+ * sure. --Mark (3/16/2010)
+ */
+ lru_resv = list_first_entry(&resmap->m_lru,
+ struct ocfs2_alloc_reservation, r_lru);
+
+ mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
+ lru_resv->r_len, ocfs2_resv_end(lru_resv));
+
+ /*
+ * Cannibalize (some or all) of the target reservation and
+ * feed it to the current window.
+ */
+ if (lru_resv->r_len <= min_bits) {
+ /*
+ * Discard completely if size is less than or equal to a
+ * reasonable threshold - 50% of window bits for non temporary
+ * windows.
+ */
+ resv->r_start = lru_resv->r_start;
+ resv->r_len = lru_resv->r_len;
+
+ __ocfs2_resv_discard(resmap, lru_resv);
+ } else {
+ unsigned int shrink;
+ if (tmpwindow)
+ shrink = min_bits;
+ else
+ shrink = lru_resv->r_len / 2;
+
+ lru_resv->r_len -= shrink;
+
+ resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+ resv->r_len = shrink;
+ }
+
+ mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+ "r_len: %u r_last_start: %u r_last_len: %u\n",
+ resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+ resv->r_last_start, resv->r_last_len);
+
+ ocfs2_resv_insert(resmap, resv);
+}
+
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ unsigned int goal = 0;
+
+ BUG_ON(!ocfs2_resv_empty(resv));
+
+ /*
+ * Begin by trying to get a window as close to the previous
+ * one as possible. Using the most recent allocation as a
+ * start goal makes sense.
+ */
+ if (resv->r_last_len) {
+ goal = resv->r_last_start + resv->r_last_len;
+ if (goal >= resmap->m_bitmap_len)
+ goal = 0;
+ }
+
+ __ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+ /* Search from last alloc didn't work, try once more from beginning. */
+ if (ocfs2_resv_empty(resv) && goal != 0)
+ __ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * Still empty? Pull oldest one off the LRU, remove it from
+ * tree, put this one in it's place.
+ */
+ ocfs2_cannibalize_resv(resmap, resv, wanted);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+}
+
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen)
+{
+ unsigned int wanted = *clen;
+
+ if (resv == NULL || ocfs2_resmap_disabled(resmap))
+ return -ENOSPC;
+
+ spin_lock(&resv_lock);
+
+ /*
+ * We don't want to over-allocate for temporary
+ * windows. Otherwise, we run the risk of fragmenting the
+ * allocation space.
+ */
+ wanted = ocfs2_resv_window_bits(resmap, resv);
+ if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+ wanted = *clen;
+
+ if (ocfs2_resv_empty(resv)) {
+ mlog(0, "empty reservation, find new window\n");
+
+ /*
+ * Try to get a window here. If it works, we must fall
+ * through and test the bitmap . This avoids some
+ * ping-ponging of windows due to non-reserved space
+ * being allocation before we initialize a window for
+ * that inode.
+ */
+ ocfs2_resv_find_window(resmap, resv, wanted);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+
+ *cstart = resv->r_start;
+ *clen = resv->r_len;
+
+ spin_unlock(&resv_lock);
+ return 0;
+}
+
+static void
+ ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int start, unsigned int end)
+{
+ unsigned int rhs = 0;
+ unsigned int old_end = ocfs2_resv_end(resv);
+
+ BUG_ON(start != resv->r_start || old_end < end);
+
+ /*
+ * Completely used? We can remove it then.
+ */
+ if (old_end == end) {
+ __ocfs2_resv_discard(resmap, resv);
+ return;
+ }
+
+ rhs = old_end - end;
+
+ /*
+ * This should have been trapped above.
+ */
+ BUG_ON(rhs == 0);
+
+ resv->r_start = end + 1;
+ resv->r_len = old_end - resv->r_start + 1;
+}
+
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen)
+{
+ unsigned int cend = cstart + clen - 1;
+
+ if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+ return;
+
+ if (resv == NULL)
+ return;
+
+ BUG_ON(cstart != resv->r_start);
+
+ spin_lock(&resv_lock);
+
+ mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
+ "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
+ cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start, resv->r_last_len);
+
+ BUG_ON(cstart < resv->r_start);
+ BUG_ON(cstart > ocfs2_resv_end(resv));
+ BUG_ON(cend > ocfs2_resv_end(resv));
+
+ ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+ resv->r_last_start = cstart;
+ resv->r_last_len = clen;
+
+ /*
+ * May have been discarded above from
+ * ocfs2_adjust_resv_from_alloc().
+ */
+ if (!ocfs2_resv_empty(resv))
+ ocfs2_resv_mark_lru(resmap, resv);
+
+ mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+ "r_len: %u r_last_start: %u r_last_len: %u\n",
+ resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+ resv->r_last_start, resv->r_last_len);
+
+ ocfs2_check_resmap(resmap);
+
+ spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 0000000..1e49cc2
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_RESERVATIONS_H
+#define OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL 2
+#define OCFS2_MAX_RESV_LEVEL 9
+#define OCFS2_MIN_RESV_LEVEL 0
+
+struct ocfs2_alloc_reservation {
+ struct rb_node r_node;
+
+ unsigned int r_start; /* Begining of current window */
+ unsigned int r_len; /* Length of the window */
+
+ unsigned int r_last_len; /* Length of most recent alloc */
+ unsigned int r_last_start; /* Start of most recent alloc */
+ struct list_head r_lru; /* LRU list head */
+
+ unsigned int r_flags;
+};
+
+#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
+#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
+ * destroyed immedately after use */
+#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
+ * directory btree */
+
+struct ocfs2_reservation_map {
+ struct rb_root m_reservations;
+ char *m_disk_bitmap;
+
+ struct ocfs2_super *m_osb;
+
+ /* The following are not initialized to meaningful values until a disk
+ * bitmap is provided. */
+ u32 m_bitmap_len; /* Number of valid
+ * bits available */
+
+ struct list_head m_lru; /* LRU of reservations
+ * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap:
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @resmap: struct ocfs2_reservation_map to initialize
+ * @obj: unused for now
+ * @ops: unused for now
+ * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocation mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalulate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: end of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, it's next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen);
+
+#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673..dacd553 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
"Force to do offline resize.");
ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small."
" Force to do offline resize.");
ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
group = (struct ocfs2_group_desc *)group_bh->b_data;
group->bg_next_group = cr->c_blkno;
-
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, group_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 19ba00f..f4c2a9e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
#define OCFS2_MAX_TO_STEAL 1024
+struct ocfs2_suballoc_result {
+ u64 sr_bg_blkno; /* The bg we allocated from. Set
+ to 0 when a block group is
+ contiguous. */
+ u64 sr_blkno; /* The first allocated block */
+ unsigned int sr_bit_offset; /* The bit in the bg */
+ unsigned int sr_bits; /* How many bits we claimed */
+};
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
}
brelse(ac->ac_bh);
ac->ac_bh = NULL;
+ ac->ac_resv = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +333,38 @@ out:
return rc;
}
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_chain_list *cl,
+ u64 p_blkno, u32 clusters)
+{
+ struct ocfs2_extent_list *el = &bg->bg_list;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(!ocfs2_supports_discontig_bg(osb));
+ if (!el->l_next_free_rec)
+ el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+ rec->e_blkno = cpu_to_le64(p_blkno);
+ rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+ le16_to_cpu(cl->cl_bpc));
+ rec->e_leaf_clusters = cpu_to_le32(clusters);
+ le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&bg->bg_free_bits_count,
+ clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct super_block * sb = alloc_inode->i_sb;
@@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
- bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
- bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+ osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
bg->bg_blkno = cpu_to_le64(group_blkno);
+ if (group_clusters == le16_to_cpu(cl->cl_cpg))
+ bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ else
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+ group_clusters);
+
/* set the 1st bit in the bitmap to account for the descriptor block */
ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bg_bh);
/* There is no need to zero out or otherwise initialize the
* other blocks in a group - All valid FS metadata in a block
@@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
return best;
}
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ struct buffer_head *bg_bh;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+ status = ocfs2_claim_clusters(handle, ac,
+ le16_to_cpu(cl->cl_cpg), &bit_off,
+ &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "new descriptor, record %u, at block %llu\n",
+ alloc_rec, (unsigned long long)bg_blkno);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ brelse(bg_bh);
+ mlog_errno(status);
+ }
+
+bail:
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ unsigned int min_bits,
+ u32 *bit_off, u32 *num_bits)
+{
+ int status = 0;
+
+ while (min_bits) {
+ status = ocfs2_claim_clusters(handle, ac, min_bits,
+ bit_off, num_bits);
+ if (status != -ENOSPC)
+ break;
+
+ min_bits >>= 1;
+ }
+
+ return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl,
+ unsigned int min_bits)
+{
+ int status;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+ struct ocfs2_group_desc *bg =
+ (struct ocfs2_group_desc *)bg_bh->b_data;
+ unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ u32 p_cpos, clusters;
+ u64 p_blkno;
+ struct ocfs2_extent_list *el = &bg->bg_list;
+
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+ le16_to_cpu(el->l_count))) {
+ if (min_bits > needed)
+ min_bits = needed;
+ status = ocfs2_block_group_claim_bits(osb, handle, ac,
+ min_bits, &p_cpos,
+ &clusters);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+ clusters);
+
+ min_bits = clusters;
+ needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ }
+
+ if (needed > 0) {
+ /*
+ * We have used up all the extent rec but can't fill up
+ * the cpg. So bail out.
+ */
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+ return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+ struct ocfs2_alloc_context *cluster_ac,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh)
+{
+ int i, ret;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ if (!bg_bh)
+ return;
+
+ bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+ el = &bg->bg_list;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+ ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+ cluster_ac->ac_bh,
+ le64_to_cpu(rec->e_blkno),
+ le32_to_cpu(rec->e_leaf_clusters));
+ if (ret)
+ mlog_errno(ret);
+ /* Try all the clusters to free */
+ }
+
+ ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+ brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+ struct buffer_head *bg_bh = NULL;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+ if (!ocfs2_supports_discontig_bg(osb)) {
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ status = ocfs2_extend_trans(handle,
+ ocfs2_calc_bg_discontig_credits(osb->sb));
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /*
+ * We're going to be grabbing from multiple cluster groups.
+ * We don't have enough credits to relink them all, and the
+ * cluster groups will be staying in cache for the duration of
+ * this operation.
+ */
+ ac->ac_allow_chain_relink = 0;
+
+ /* Claim the first region */
+ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+ &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ min_bits = num_bits;
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "new descriptor, record %u, at block %llu\n",
+ alloc_rec, (unsigned long long)bg_blkno);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+ bg_bh, ac, cl, min_bits);
+ if (status)
+ mlog_errno(status);
+
+bail:
+ if (status)
+ ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
/*
* We expect the block group allocator to already be locked.
*/
@@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct ocfs2_chain_list *cl;
struct ocfs2_alloc_context *ac = NULL;
handle_t *handle = NULL;
- u32 bit_off, num_bits;
u16 alloc_rec;
- u64 bg_blkno;
struct buffer_head *bg_bh = NULL;
struct ocfs2_group_desc *bg;
@@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
(unsigned long long)*last_alloc_group);
ac->ac_last_group = *last_alloc_group;
}
- status = ocfs2_claim_clusters(osb,
- handle,
- ac,
- le16_to_cpu(cl->cl_cpg),
- &bit_off,
- &num_bits);
- if (status < 0) {
+
+ bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ bg_bh = ocfs2_block_group_alloc_discontig(handle,
+ alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh)) {
+ status = PTR_ERR(bg_bh);
+ bg_bh = NULL;
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
-
- alloc_rec = ocfs2_find_smallest_chain(cl);
-
- /* setup the group */
- bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "new descriptor, record %u, at block %llu\n",
- alloc_rec, (unsigned long long)bg_blkno);
-
- bg_bh = sb_getblk(osb->sb, bg_blkno);
- if (!bg_bh) {
- status = -EIO;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
-
- status = ocfs2_block_group_fill(handle,
- alloc_inode,
- bg_bh,
- bg_blkno,
- alloc_rec,
- cl);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
goto bail;
}
+ alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
le16_to_cpu(bg->bg_free_bits_count));
- le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
+ le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+ le16_to_cpu(bg->bg_bits));
+ cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
@@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
status = ocfs2_reserve_suballoc_bits(osb, (*ac),
EXTENT_ALLOC_SYSTEM_INODE,
(u32)osb->slot_num, NULL,
- ALLOC_NEW_GROUP);
+ ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
if (status >= 0) {
@@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
- if (status == -EFBIG) {
- /* The local alloc window is outside ac_max_block.
- * use the main bitmap. */
- status = -ENOSPC;
- } else if ((status < 0) && (status != -ENOSPC)) {
+ if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
}
@@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
unsigned int total_bits,
- u16 *bit_off,
- u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
void *bitmap;
u16 best_offset, best_size;
@@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
- /* XXX: I think the first clause is equivalent to the second
- * - jlbec */
- if (found == bits_wanted) {
- *bit_off = start - found;
- *bits_found = found;
- } else if (best_size) {
- *bit_off = best_offset;
- *bits_found = best_size;
+ if (best_size) {
+ res->sr_bit_offset = best_offset;
+ res->sr_bits = best_size;
} else {
status = -ENOSPC;
/* No error log here -- see the comment above
@@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
- status = ocfs2_journal_dirty(handle,
- group_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, group_bh);
bail:
mlog_exit(status);
@@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
}
prev_bg->bg_next_group = bg->bg_next_group;
-
- status = ocfs2_journal_dirty(handle, prev_bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, prev_bg_bh);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
}
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
-
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, bg_bh);
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
}
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+ ocfs2_journal_dirty(handle, fe_bh);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = 0;
out_rollback:
if (status < 0) {
fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int search = -ENOSPC;
int ret;
u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
- max_bits,
- &tmp_off, &tmp_found);
+ max_bits, res);
if (ret)
return ret;
if (max_block) {
blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
gd_cluster_off +
- tmp_off + tmp_found);
+ res->sr_bit_offset +
+ res->sr_bits);
mlog(0, "Checking %llu against %llu\n",
(unsigned long long)blkoff,
(unsigned long long)max_block);
@@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
* of bits. */
- if (min_bits <= tmp_found) {
- *bit_off = tmp_off;
- *bits_found = tmp_found;
+ if (min_bits <= res->sr_bits)
search = 0; /* success */
- } else if (tmp_found) {
+ else if (res->sr_bits) {
/*
* Don't show bits which we'll be returning
* for allocation to the local alloc bitmap.
*/
- ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
+ ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
}
}
@@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int ret = -ENOSPC;
u64 blkoff;
@@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
- bit_off, bits_found);
+ res);
if (!ret && max_block) {
- blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
- *bits_found;
+ blkoff = le64_to_cpu(bg->bg_blkno) +
+ res->sr_bit_offset + res->sr_bits;
mlog(0, "Checking %llu against %llu\n",
(unsigned long long)blkoff,
(unsigned long long)max_block);
@@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out:
return ret;
}
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_chain_list *cl)
+{
+ unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+ unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+ unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
+
+ if (res->sr_bit_offset < bitoff)
+ return 0;
+ if (res->sr_bit_offset >= (bitoff + bitcount))
+ return 0;
+ res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+ (res->sr_bit_offset - bitoff);
+ if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+ res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+ return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_suballoc_result *res)
+{
+ int i;
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+ struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+ res->sr_blkno = 0;
+ return;
+ }
+
+ res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+ res->sr_bg_blkno = 0; /* Clear it for contig block groups */
+ if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+ !bg->bg_list.l_next_free_rec)
+ return;
+
+ for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+ rec = &bg->bg_list.l_recs[i];
+ if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+ res->sr_bg_blkno = bg_blkno; /* Restore */
+ break;
+ }
+ }
+}
+
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 gd_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int ret;
- u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
- &group_bh);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di,
+ res->sr_bg_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
gd = (struct ocfs2_group_desc *) group_bh->b_data;
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- ac->ac_max_block, bit_off, &found);
+ ac->ac_max_block, res);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
- *num_bits = found;
+ if (!ret)
+ ocfs2_bg_discontig_fix_result(ac, gd, res);
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
- *num_bits,
+ res->sr_bits,
le16_to_cpu(gd->bg_chain));
if (ret < 0) {
mlog_errno(ret);
@@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- *bit_off, *num_bits);
+ res->sr_bit_offset, res->sr_bits);
if (ret < 0)
mlog_errno(ret);
@@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int status;
- u16 chain, tmp_bits;
+ u16 chain;
u32 tmp_used;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
@@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
bits_wanted, min_bits,
- ac->ac_max_block, bit_off,
- &tmp_bits)) == -ENOSPC) {
+ ac->ac_max_block,
+ res)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
@@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
- tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+ res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
- *num_bits = tmp_bits;
+ res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
+
+ BUG_ON(res->sr_bits == 0);
+ if (!status)
+ ocfs2_bg_discontig_fix_result(ac, bg, res);
- BUG_ON(*num_bits == 0);
/*
* Keep track of previous block descriptor read. When
@@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
*/
if (ac->ac_allow_chain_relink &&
(prev_group_bh) &&
- (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+ (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
status = ocfs2_relink_block_group(handle, alloc_inode,
ac->ac_bh, group_bh,
prev_group_bh, chain);
@@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
-
- status = ocfs2_journal_dirty(handle,
- ac->ac_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
+ ocfs2_journal_dirty(handle, ac->ac_bh);
status = ocfs2_block_group_set_bits(handle,
alloc_inode,
bg,
group_bh,
- *bit_off,
- *num_bits);
+ res->sr_bit_offset,
+ res->sr_bits);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
+ mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
(unsigned long long)le64_to_cpu(fe->i_blkno));
- *bg_blkno = le64_to_cpu(bg->bg_blkno);
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
brelse(group_bh);
@@ -1593,19 +1836,15 @@ bail:
}
/* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno)
+ struct ocfs2_suballoc_result *res)
{
int status;
u16 victim, i;
u16 bits_left = 0;
- u64 hint_blkno = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
@@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+ ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used "
"bits but only %u total.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (hint_blkno) {
+ res->sr_bg_blkno = ac->ac_last_group;
+ if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
* allocation group. This helps us mantain some
* contiguousness across allocations. */
status = ocfs2_search_one_group(ac, handle, bits_wanted,
- min_bits, bit_off, num_bits,
- hint_blkno, &bits_left);
- if (!status) {
- /* Be careful to update *bg_blkno here as the
- * caller is expecting it to be filled in, and
- * ocfs2_search_one_group() won't do that for
- * us. */
- *bg_blkno = hint_blkno;
+ min_bits, res, &bits_left);
+ if (!status)
goto set_hint;
- }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
ac->ac_chain = victim;
ac->ac_allow_chain_relink = 1;
- status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
- num_bits, bg_blkno, &bits_left);
+ status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+ res, &bits_left);
if (!status)
goto set_hint;
if (status < 0 && status != -ENOSPC) {
@@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
ac->ac_chain = i;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
- bit_off, num_bits, bg_blkno,
- &bits_left);
+ res, &bits_left);
if (!status)
break;
if (status < 0 && status != -ENOSPC) {
@@ -1703,7 +1936,7 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = *bg_blkno;
+ ac->ac_last_group = res->sr_bg_blkno;
}
bail:
@@ -1711,37 +1944,37 @@ bail:
return status;
}
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
unsigned int *num_bits,
u64 *blkno_start)
{
int status;
- u64 bg_blkno;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
BUG_ON(!ac);
BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
1,
- suballoc_bit_start,
- num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
- ac->ac_bits_given += (*num_bits);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit_start = res.sr_bit_offset;
+ *blkno_start = res.sr_blkno;
+ ac->ac_bits_given += res.sr_bits;
+ *num_bits = res.sr_bits;
status = 0;
bail:
mlog_exit(status);
@@ -1749,10 +1982,10 @@ bail:
}
static void ocfs2_init_inode_ac_group(struct inode *dir,
- struct buffer_head *parent_fe_bh,
+ struct buffer_head *parent_di_bh,
struct ocfs2_alloc_context *ac)
{
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
/*
* Try to allocate inodes from some specific group.
*
@@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
if (OCFS2_I(dir)->ip_last_used_group &&
OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
- else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
- ac->ac_last_group = ocfs2_which_suballoc_group(
- le64_to_cpu(fe->i_blkno),
- le16_to_cpu(fe->i_suballoc_bit));
+ else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+ if (di->i_suballoc_loc)
+ ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+ else
+ ac->ac_last_group = ocfs2_which_suballoc_group(
+ le64_to_cpu(di->i_blkno),
+ le16_to_cpu(di->i_suballoc_bit));
+ }
}
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno)
{
int status;
- unsigned int num_bits;
- u64 bg_blkno;
+ struct ocfs2_suballoc_result res;
mlog_entry_void();
@@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
1,
1,
- suballoc_bit,
- &num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- BUG_ON(num_bits != 1);
+ BUG_ON(res.sr_bits != 1);
- *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit = res.sr_bit_offset;
+ *fe_blkno = res.sr_blkno;
ac->ac_bits_given++;
ocfs2_save_inode_ac_group(dir, ac);
status = 0;
@@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
* contig. allocation, set to '1' to indicate we can deal with extents
* of any size.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
@@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
{
int status;
unsigned int bits_wanted = max_clusters;
- u64 bg_blkno = 0;
- u16 bg_bit_off;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+ struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
mlog_entry_void();
@@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
&& ac->ac_which != OCFS2_AC_USE_MAIN);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+ WARN_ON(min_clusters > 1);
+
status = ocfs2_claim_local_alloc_bits(osb,
handle,
ac,
@@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
if (bits_wanted > (osb->bitmap_cpg - 1))
bits_wanted = osb->bitmap_cpg - 1;
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
min_clusters,
- &bg_bit_off,
- num_clusters,
- &bg_blkno);
+ &res);
if (!status) {
+ BUG_ON(res.sr_blkno); /* cluster alloc can't set */
*cluster_start =
ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
- bg_blkno,
- bg_bit_off);
+ res.sr_bg_blkno,
+ res.sr_bit_offset);
atomic_inc(&osb->alloc_stats.bitmap_data);
+ *num_clusters = res.sr_bits;
}
}
if (status < 0) {
@@ -1958,8 +2193,7 @@ bail:
return status;
}
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
{
unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
- return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+ return __ocfs2_claim_clusters(handle, ac, min_clusters,
bits_wanted, cluster_start, num_clusters);
}
@@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
if (undo_fn)
jbd_unlock_bh_state(group_bh);
- status = ocfs2_journal_dirty(handle, group_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, group_bh);
bail:
return status;
}
@@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
count);
tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
- status = ocfs2_journal_dirty(handle, alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, alloc_bh);
bail:
brelse(group_bh);
@@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle,
u16 bit = le16_to_cpu(di->i_suballoc_bit);
u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (di->i_suballoc_loc)
+ bg_blkno = le64_to_cpu(di->i_suballoc_loc);
return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
inode_alloc_bh, bit, bg_blkno, 1);
}
@@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
struct buffer_head *alloc_bh, u64 blkno,
u16 bit, int *res)
{
- struct ocfs2_dinode *alloc_fe;
+ struct ocfs2_dinode *alloc_di;
struct ocfs2_group_desc *group;
struct buffer_head *group_bh = NULL;
u64 bg_blkno;
@@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
(unsigned int)bit);
- alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
- if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+ alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+ if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
(unsigned int)bit,
- ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+ ocfs2_bits_per_group(&alloc_di->id2.i_chain));
status = -EINVAL;
goto bail;
}
- bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
- status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+ if (alloc_di->i_suballoc_loc)
+ bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+ status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
&group_bh);
if (status < 0) {
mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e0f46df..a017dd3 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
+struct ocfs2_suballoc_result;
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32, /* bits_wanted */
u32, /* min_bits */
u64, /* max_block */
- u16 *, /* *bit_off */
- u16 *); /* *bits_found */
+ struct ocfs2_suballoc_result *);
+ /* found bits */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
* Use this variant of ocfs2_claim_clusters to specify a maxiumum
* number of clusters smaller than the allocation reserved.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee0319..1c2c39f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
- unsigned int localalloc_opt;
+ int localalloc_opt;
+ unsigned int resv_level;
+ int dir_resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
@@ -176,6 +178,8 @@ enum {
Opt_noacl,
Opt_usrquota,
Opt_grpquota,
+ Opt_resv_level,
+ Opt_dir_resv_level,
Opt_err,
};
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_resv_level, "resv_level=%u"},
+ {Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
};
@@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
- osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+ ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+ osb->osb_resv_level = parsed_options.resv_level;
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ if (parsed_options.dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ else
+ osb->osb_dir_resv_level = parsed_options.dir_resv_level;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
options ? options : "(none)");
mopt->commit_interval = 0;
- mopt->mount_opt = 0;
+ mopt->mount_opt = OCFS2_MOUNT_NOINTR;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
- mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->localalloc_opt = -1;
mopt->cluster_stack[0] = '\0';
+ mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
if (!options) {
status = 1;
@@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
status = 0;
goto bail;
}
- if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ if (option >= 0)
mopt->localalloc_opt = option;
break;
case Opt_localflocks:
@@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
break;
+ case Opt_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->resv_level = option;
+ break;
+ case Opt_dir_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = option;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
(unsigned) (osb->osb_commit_interval / HZ));
local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
- if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ if (local_alloc_megs != ocfs2_la_default_mb(osb))
seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
else
seq_printf(s, ",noacl");
+ if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+ seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+ if (osb->osb_dir_resv_level != osb->osb_resv_level)
+ seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
return 0;
}
@@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
+ status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+ osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
iput(inode);
- osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+ osb->s_feature_incompat) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
@@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
ocfs2_handle_error(sb);
}
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+ int rc;
+ sigset_t blocked;
+
+ sigfillset(&blocked);
+ rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+ BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+ int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+ BUG_ON(rc);
+}
+
module_init(ocfs2_init);
module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f527..40c7de0 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e77730..98ee6c4 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
struct ocfs2_alloc_context *meta_ac;
struct ocfs2_alloc_context *data_ac;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ int set_abort;
};
#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
goto leave;
}
- status = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, vb->vb_bh);
clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
}
le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
- ret = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, vb->vb_bh);
if (ext_flags & OCFS2_EXT_REFCOUNTED)
ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
memset(bh->b_data + cp_len, 0,
blocksize - cp_len);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
bh = NULL;
@@ -2148,15 +2136,19 @@ alloc_value:
orig_clusters = ocfs2_xa_value_clusters(loc);
rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
if (rc < 0) {
- /*
- * If we tried to grow an existing external value,
- * ocfs2_xa_cleanuP-value_truncate() is going to
- * let it stand. We have to restore its original
- * value size.
- */
- loc->xl_entry->xe_value_size = orig_value_size;
+ ctxt->set_abort = 1;
ocfs2_xa_cleanup_value_truncate(loc, "growing",
orig_clusters);
+ /*
+ * If we were growing an existing value,
+ * ocfs2_xa_cleanup_value_truncate() won't remove
+ * the entry. We need to restore the original value
+ * size.
+ */
+ if (loc->xl_entry) {
+ BUG_ON(!orig_value_size);
+ loc->xl_entry->xe_value_size = orig_value_size;
+ }
mlog_errno(rc);
}
}
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
blk = le64_to_cpu(xb->xb_blkno);
bit = le16_to_cpu(xb->xb_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (xb->xb_suballoc_loc)
+ bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
xb_alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
- ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(ctxt->handle, di_bh);
out:
return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
int ret;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *new_bh = NULL;
struct ocfs2_xattr_block *xblk;
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
goto end;
}
- ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
- &suballoc_bit_start, &num_got,
- &first_blkno);
+ ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+ &suballoc_loc, &suballoc_bit_start,
+ &num_got, &first_blkno);
if (ret < 0) {
mlog_errno(ret);
goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
memset(xblk, 0, inode->i_sb->s_blocksize);
strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+ xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
- xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+ xblk->xb_fs_generation =
+ cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
xblk->xb_blkno = cpu_to_le64(first_blkno);
if (indexed) {
struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
ret = ocfs2_xa_set(&loc, xi, ctxt);
if (!ret)
xs->here = loc.xl_entry;
- else if (ret != -ENOSPC)
+ else if ((ret != -ENOSPC) || ctxt->set_abort)
goto end;
else {
ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
- } else if (ret == -ENOSPC) {
+ } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
if (di->i_xattr_loc && !xbs->xattr_bh) {
ret = ocfs2_xattr_block_find(inode,
xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
u32 bit_off, len;
u64 blkno;
handle_t *handle = ctxt->handle;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *xb_bh = xs->xattr_bh;
struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
goto out;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
1, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
* We need to update the first bucket of the old extent and all
* the buckets going to the new extent.
*/
- credits = ((num_buckets + 1) * blks_per_bucket) +
- handle->h_buffer_credits;
+ credits = ((num_buckets + 1) * blks_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
u32 *first_hash)
{
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+ int ret, credits = 2 * blk_per_bucket;
BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
goto leave;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
goto leave;
}
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
leave:
return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
* existing bucket. Then we add the last existing bucket, the
* new bucket, and the first bucket (3 * blk_per_bucket).
*/
- credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
- handle->h_buffer_credits;
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
}
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, root_bh);
ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
if (ret)
@@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
goto out;
}
- ret = ocfs2_claim_clusters(osb, handle, data_ac,
+ ret = ocfs2_claim_clusters(handle, data_ac,
len, &p_cluster, &num_clusters);
if (ret) {
mlog_errno(ret);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index c82af6a..b44bb83 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,7 +3,6 @@
* Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
* Released under GPL v2.
*/
-#include <linux/version.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8418fcc..c7f9f23 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -730,6 +730,7 @@ out_no_task:
static const struct file_operations proc_info_file_operations = {
.read = proc_info_read,
+ .llseek = generic_file_llseek,
};
static int proc_single_show(struct seq_file *m, void *v)
@@ -987,6 +988,7 @@ out_no_task:
static const struct file_operations proc_environ_operations = {
.read = environ_read,
+ .llseek = generic_file_llseek,
};
static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1060,6 +1062,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
static const struct file_operations proc_oom_adjust_operations = {
.read = oom_adjust_read,
.write = oom_adjust_write,
+ .llseek = generic_file_llseek,
};
#ifdef CONFIG_AUDITSYSCALL
@@ -1131,6 +1134,7 @@ out_free_page:
static const struct file_operations proc_loginuid_operations = {
.read = proc_loginuid_read,
.write = proc_loginuid_write,
+ .llseek = generic_file_llseek,
};
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1151,6 +1155,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
static const struct file_operations proc_sessionid_operations = {
.read = proc_sessionid_read,
+ .llseek = generic_file_llseek,
};
#endif
@@ -1202,6 +1207,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
static const struct file_operations proc_fault_inject_operations = {
.read = proc_fault_inject_read,
.write = proc_fault_inject_write,
+ .llseek = generic_file_llseek,
};
#endif
@@ -1943,7 +1949,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
}
static const struct file_operations proc_fdinfo_file_operations = {
- .open = nonseekable_open,
+ .open = nonseekable_open,
.read = proc_fdinfo_read,
};
@@ -2227,6 +2233,7 @@ out_no_task:
static const struct file_operations proc_pid_attr_operations = {
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
+ .llseek = generic_file_llseek,
};
static const struct pid_entry attr_dir_stuff[] = {
@@ -2347,6 +2354,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
static const struct file_operations proc_coredump_filter_operations = {
.read = proc_coredump_filter_read,
.write = proc_coredump_filter_write,
+ .llseek = generic_file_llseek,
};
#endif
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d35b232..aea8502 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -232,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
if (rv == -ENOIOCTLCMD)
rv = -EINVAL;
} else if (ioctl) {
- lock_kernel();
+ WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
+ "%pf will be called without the Bkl held\n", ioctl);
rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
- unlock_kernel();
}
pde_users_dec(pde);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 19979a2..c837a77 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
static const struct file_operations proc_kcore_operations = {
.read = read_kcore,
.open = open_kcore,
+ .llseek = generic_file_llseek,
};
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a4..bd4b5a7 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
.poll = kmsg_poll,
.open = kmsg_open,
.release = kmsg_release,
+ .llseek = generic_file_llseek,
};
static int __init proc_kmsg_init(void)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9fbc99e..91c817f 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
static const struct file_operations proc_vmcore_operations = {
.read = read_vmcore,
+ .llseek = generic_file_llseek,
};
static struct vmcore* __init get_new_element(void)
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c948534..f47cd21 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -214,7 +214,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
return 0;
}
-static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
+int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ramfs_fs_info *fsi;
struct inode *inode = NULL;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d2935..4e321f7 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
};
static int
-fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
+fill_read(struct file *file, char *buffer, loff_t off, size_t count)
{
- struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+ struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
rc = -EIO;
if (attr->read)
- rc = attr->read(kobj, attr, buffer, off, count);
+ rc = attr->read(file, kobj, attr, buffer, off, count);
sysfs_put_active(attr_sd);
@@ -70,8 +70,7 @@ static ssize_t
read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
{
struct bin_buffer *bb = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- int size = dentry->d_inode->i_size;
+ int size = file->f_path.dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
mutex_lock(&bb->mutex);
- count = fill_read(dentry, bb->buffer, offs, count);
+ count = fill_read(file, bb->buffer, offs, count);
if (count < 0) {
mutex_unlock(&bb->mutex);
goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
}
static int
-flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
+flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
{
- struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+ struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
rc = -EIO;
if (attr->write)
- rc = attr->write(kobj, attr, buffer, offset, count);
+ rc = attr->write(file, kobj, attr, buffer, offset, count);
sysfs_put_active(attr_sd);
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
size_t bytes, loff_t *off)
{
struct bin_buffer *bb = file->private_data;
- struct dentry *dentry = file->f_path.dentry;
- int size = dentry->d_inode->i_size;
+ int size = file->f_path.dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
memcpy(bb->buffer, temp, count);
- count = flush_write(dentry, bb->buffer, offs, count);
+ count = flush_write(file, bb->buffer, offs, count);
mutex_unlock(&bb->mutex);
if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
if (!attr->mmap)
goto out_put;
- rc = attr->mmap(kobj, attr, vma);
+ rc = attr->mmap(file, kobj, attr, vma);
if (rc)
goto out_put;
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
void sysfs_remove_bin_file(struct kobject *kobj,
const struct bin_attribute *attr)
{
- sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+ sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5907178..7e54bac 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
{
struct sysfs_inode_attrs *ps_iattr;
- if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+ if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
return -EEXIST;
sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
* Pointer to sysfs_dirent if found, NULL if not.
*/
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+ const void *ns,
const unsigned char *name)
{
struct sysfs_dirent *sd;
- for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+ for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+ if (ns && sd->s_ns && (sd->s_ns != ns))
+ continue;
if (!strcmp(sd->s_name, name))
return sd;
+ }
return NULL;
}
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
* Pointer to sysfs_dirent if found, NULL if not.
*/
struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+ const void *ns,
const unsigned char *name)
{
struct sysfs_dirent *sd;
mutex_lock(&sysfs_mutex);
- sd = sysfs_find_dirent(parent_sd, name);
+ sd = sysfs_find_dirent(parent_sd, ns, name);
sysfs_get(sd);
mutex_unlock(&sysfs_mutex);
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
EXPORT_SYMBOL_GPL(sysfs_get_dirent);
static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
- const char *name, struct sysfs_dirent **p_sd)
+ enum kobj_ns_type type, const void *ns, const char *name,
+ struct sysfs_dirent **p_sd)
{
umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
if (!sd)
return -ENOMEM;
+
+ sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+ sd->s_ns = ns;
sd->s_dir.kobj = kobj;
/* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
int sysfs_create_subdir(struct kobject *kobj, const char *name,
struct sysfs_dirent **p_sd)
{
- return create_dir(kobj, kobj->sd, name, p_sd);
+ return create_dir(kobj, kobj->sd,
+ KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+/**
+ * sysfs_read_ns_type: return associated ns_type
+ * @kobj: the kobject being queried
+ *
+ * Each kobject can be tagged with exactly one namespace type
+ * (i.e. network or user). Return the ns_type associated with
+ * this object if any
+ */
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+ const struct kobj_ns_type_operations *ops;
+ enum kobj_ns_type type;
+
+ ops = kobj_child_ns_ops(kobj);
+ if (!ops)
+ return KOBJ_NS_TYPE_NONE;
+
+ type = ops->type;
+ BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+ BUG_ON(type >= KOBJ_NS_TYPES);
+ BUG_ON(!kobj_ns_type_registered(type));
+
+ return type;
}
/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
*/
int sysfs_create_dir(struct kobject * kobj)
{
+ enum kobj_ns_type type;
struct sysfs_dirent *parent_sd, *sd;
+ const void *ns = NULL;
int error = 0;
BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
else
parent_sd = &sysfs_root;
- error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+ if (sysfs_ns_type(parent_sd))
+ ns = kobj->ktype->namespace(kobj);
+ type = sysfs_read_ns_type(kobj);
+
+ error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
if (!error)
kobj->sd = sd;
return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct dentry *ret = NULL;
- struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+ struct dentry *parent = dentry->d_parent;
+ struct sysfs_dirent *parent_sd = parent->d_fsdata;
struct sysfs_dirent *sd;
struct inode *inode;
+ enum kobj_ns_type type;
+ const void *ns;
mutex_lock(&sysfs_mutex);
- sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+ type = sysfs_ns_type(parent_sd);
+ ns = sysfs_info(dir->i_sb)->ns[type];
+
+ sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
/* no such entry */
if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
}
int sysfs_rename(struct sysfs_dirent *sd,
- struct sysfs_dirent *new_parent_sd, const char *new_name)
+ struct sysfs_dirent *new_parent_sd, const void *new_ns,
+ const char *new_name)
{
const char *dup_name = NULL;
int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
mutex_lock(&sysfs_mutex);
error = 0;
- if ((sd->s_parent == new_parent_sd) &&
+ if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
(strcmp(sd->s_name, new_name) == 0))
goto out; /* nothing to rename */
error = -EEXIST;
- if (sysfs_find_dirent(new_parent_sd, new_name))
+ if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
goto out;
/* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
sd->s_parent = new_parent_sd;
sysfs_link_sibling(sd);
}
+ sd->s_ns = new_ns;
error = 0;
out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
{
- return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+ struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+ const void *new_ns = NULL;
+
+ if (sysfs_ns_type(parent_sd))
+ new_ns = kobj->ktype->namespace(kobj);
+
+ return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
}
int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
{
struct sysfs_dirent *sd = kobj->sd;
struct sysfs_dirent *new_parent_sd;
+ const void *new_ns = NULL;
BUG_ON(!sd->s_parent);
+ if (sysfs_ns_type(sd->s_parent))
+ new_ns = kobj->ktype->namespace(kobj);
new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
new_parent_kobj->sd : &sysfs_root;
- return sysfs_rename(sd, new_parent_sd, sd->s_name);
+ return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
}
/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
return 0;
}
-static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
- ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+ struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
{
if (pos) {
int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
pos->s_parent == parent_sd &&
ino == pos->s_ino;
sysfs_put(pos);
- if (valid)
- return pos;
+ if (!valid)
+ pos = NULL;
}
- pos = NULL;
- if ((ino > 1) && (ino < INT_MAX)) {
+ if (!pos && (ino > 1) && (ino < INT_MAX)) {
pos = parent_sd->s_dir.children;
while (pos && (ino > pos->s_ino))
pos = pos->s_sibling;
}
+ while (pos && pos->s_ns && pos->s_ns != ns)
+ pos = pos->s_sibling;
return pos;
}
-static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
- ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+ struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
{
- pos = sysfs_dir_pos(parent_sd, ino, pos);
+ pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
if (pos)
pos = pos->s_sibling;
+ while (pos && pos->s_ns && pos->s_ns != ns)
+ pos = pos->s_sibling;
return pos;
}
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
struct dentry *dentry = filp->f_path.dentry;
struct sysfs_dirent * parent_sd = dentry->d_fsdata;
struct sysfs_dirent *pos = filp->private_data;
+ enum kobj_ns_type type;
+ const void *ns;
ino_t ino;
+ type = sysfs_ns_type(parent_sd);
+ ns = sysfs_info(dentry->d_sb)->ns[type];
+
if (filp->f_pos == 0) {
ino = parent_sd->s_ino;
if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
filp->f_pos++;
}
mutex_lock(&sysfs_mutex);
- for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
+ for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
pos;
- pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
+ pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
const char * name;
unsigned int type;
int len, ret;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b25..1beaa73 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
mutex_lock(&sysfs_mutex);
if (sd && dir)
- sd = sysfs_find_dirent(sd, dir);
+ /* Only directories are tagged, so no need to pass
+ * a tag explicitly.
+ */
+ sd = sysfs_find_dirent(sd, NULL, dir);
if (sd && attr)
- sd = sysfs_find_dirent(sd, attr);
+ sd = sysfs_find_dirent(sd, NULL, attr);
if (sd)
sysfs_notify_dirent(sd);
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
int error;
if (group)
- dir_sd = sysfs_get_dirent(kobj->sd, group);
+ dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
else
dir_sd = sysfs_get(kobj->sd);
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
mutex_lock(&sysfs_mutex);
rc = -ENOENT;
- sd = sysfs_find_dirent(kobj->sd, attr->name);
+ sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
if (!sd)
goto out;
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
{
- sysfs_hash_and_remove(kobj->sd, attr->name);
+ sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
}
void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
struct sysfs_dirent *dir_sd;
if (group)
- dir_sd = sysfs_get_dirent(kobj->sd, group);
+ dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
else
dir_sd = sysfs_get(kobj->sd);
if (dir_sd) {
- sysfs_hash_and_remove(dir_sd, attr->name);
+ sysfs_hash_and_remove(dir_sd, NULL, attr->name);
sysfs_put(dir_sd);
}
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe61194..23c1e59 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
int i;
for (i = 0, attr = grp->attrs; *attr; i++, attr++)
- sysfs_hash_and_remove(dir_sd, (*attr)->name);
+ sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
}
static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
* visibility. Do this by first removing then
* re-adding (if required) the file */
if (update)
- sysfs_hash_and_remove(dir_sd, (*attr)->name);
+ sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
if (grp->is_visible) {
mode = grp->is_visible(kobj, *attr, i);
if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
struct sysfs_dirent *sd;
if (grp->name) {
- sd = sysfs_get_dirent(dir_sd, grp->name);
+ sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
if (!sd) {
WARN(!sd, KERN_WARNING "sysfs group %p not found for "
"kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a94..bbd77e9 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
sysfs_put(sd);
}
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
{
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
sysfs_addrm_start(&acxt, dir_sd);
- sd = sysfs_find_dirent(dir_sd, name);
+ sd = sysfs_find_dirent(dir_sd, ns, name);
+ if (sd && (sd->s_ns != ns))
+ sd = NULL;
if (sd)
sysfs_remove_one(&acxt, sd);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 7761378..281c0c9 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
struct sysfs_dirent sysfs_root = {
.s_name = "",
.s_count = ATOMIC_INIT(1),
- .s_flags = SYSFS_DIR,
+ .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
.s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
.s_ino = 1,
};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
+static int sysfs_test_super(struct super_block *sb, void *data)
+{
+ struct sysfs_super_info *sb_info = sysfs_info(sb);
+ struct sysfs_super_info *info = data;
+ enum kobj_ns_type type;
+ int found = 1;
+
+ for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+ if (sb_info->ns[type] != info->ns[type])
+ found = 0;
+ }
+ return found;
+}
+
+static int sysfs_set_super(struct super_block *sb, void *data)
+{
+ int error;
+ error = set_anon_super(sb, data);
+ if (!error)
+ sb->s_fs_info = data;
+ return error;
+}
+
static int sysfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
- return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
+ struct sysfs_super_info *info;
+ enum kobj_ns_type type;
+ struct super_block *sb;
+ int error;
+
+ error = -ENOMEM;
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ goto out;
+
+ for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+ info->ns[type] = kobj_ns_current(type);
+
+ sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
+ if (IS_ERR(sb) || sb->s_fs_info != info)
+ kfree(info);
+ if (IS_ERR(sb)) {
+ error = PTR_ERR(sb);
+ goto out;
+ }
+ if (!sb->s_root) {
+ sb->s_flags = flags;
+ error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ deactivate_locked_super(sb);
+ goto out;
+ }
+ sb->s_flags |= MS_ACTIVE;
+ }
+
+ simple_set_mnt(mnt, sb);
+ error = 0;
+out:
+ return error;
+}
+
+static void sysfs_kill_sb(struct super_block *sb)
+{
+ struct sysfs_super_info *info = sysfs_info(sb);
+
+ /* Remove the superblock from fs_supers/s_instances
+ * so we can't find it, before freeing sysfs_super_info.
+ */
+ kill_anon_super(sb);
+ kfree(info);
}
static struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.get_sb = sysfs_get_sb,
- .kill_sb = kill_anon_super,
+ .kill_sb = sysfs_kill_sb,
};
+void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
+{
+ struct super_block *sb;
+
+ mutex_lock(&sysfs_mutex);
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
+ struct sysfs_super_info *info = sysfs_info(sb);
+ /*
+ * If we see a superblock on the fs_supers/s_instances
+ * list the unmount has not completed and sb->s_fs_info
+ * points to a valid struct sysfs_super_info.
+ */
+ /* Ignore superblocks with the wrong ns */
+ if (info->ns[type] != ns)
+ continue;
+ info->ns[type] = NULL;
+ }
+ spin_unlock(&sb_lock);
+ mutex_unlock(&sysfs_mutex);
+}
+
int __init sysfs_init(void)
{
int err = -ENOMEM;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index b93ec51..f71246b 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
if (!sd)
goto out_put;
+ if (sysfs_ns_type(parent_sd))
+ sd->s_ns = target->ktype->namespace(target);
sd->s_symlink.target_sd = target_sd;
target_sd = NULL; /* reference is now owned by the symlink */
@@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
}
/**
+ * sysfs_delete_link - remove symlink in object's directory.
+ * @kobj: object we're acting for.
+ * @targ: object we're pointing to.
+ * @name: name of the symlink to remove.
+ *
+ * Unlike sysfs_remove_link sysfs_delete_link has enough information
+ * to successfully delete symlinks in tagged directories.
+ */
+void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
+ const char *name)
+{
+ const void *ns = NULL;
+ spin_lock(&sysfs_assoc_lock);
+ if (targ->sd)
+ ns = targ->sd->s_ns;
+ spin_unlock(&sysfs_assoc_lock);
+ sysfs_hash_and_remove(kobj->sd, ns, name);
+}
+
+/**
* sysfs_remove_link - remove symlink in object's directory.
* @kobj: object we're acting for.
* @name: name of the symlink to remove.
@@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
else
parent_sd = kobj->sd;
- sysfs_hash_and_remove(parent_sd, name);
+ sysfs_hash_and_remove(parent_sd, NULL, name);
}
/**
@@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
const char *old, const char *new)
{
struct sysfs_dirent *parent_sd, *sd = NULL;
+ const void *old_ns = NULL, *new_ns = NULL;
int result;
if (!kobj)
@@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
else
parent_sd = kobj->sd;
+ if (targ->sd)
+ old_ns = targ->sd->s_ns;
+
result = -ENOENT;
- sd = sysfs_get_dirent(parent_sd, old);
+ sd = sysfs_get_dirent(parent_sd, old_ns, old);
if (!sd)
goto out;
@@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
if (sd->s_symlink.target_sd->s_dir.kobj != targ)
goto out;
- result = sysfs_rename(sd, parent_sd, new);
+ if (sysfs_ns_type(parent_sd))
+ new_ns = targ->ktype->namespace(targ);
+
+ result = sysfs_rename(sd, parent_sd, new_ns, new);
out:
sysfs_put(sd);
@@ -261,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
EXPORT_SYMBOL_GPL(sysfs_create_link);
EXPORT_SYMBOL_GPL(sysfs_remove_link);
+EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44..6a13105 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
struct sysfs_dirent *s_sibling;
const char *s_name;
+ const void *s_ns; /* namespace tag */
union {
struct sysfs_elem_dir s_dir;
struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
-#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
-#define SYSFS_FLAG_REMOVED 0x0200
+/* identify any namespace tag on sysfs_dirents */
+#define SYSFS_NS_TYPE_MASK 0xff00
+#define SYSFS_NS_TYPE_SHIFT 8
+
+#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
+#define SYSFS_FLAG_REMOVED 0x020000
static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
{
return sd->s_flags & SYSFS_TYPE_MASK;
}
+/*
+ * Return any namespace tags on this dirent.
+ * enum kobj_ns_type is defined in linux/kobject.h
+ */
+static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
+{
+ return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define sysfs_dirent_init_lockdep(sd) \
do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
/*
* mount.c
*/
+
+/*
+ * Each sb is associated with a set of namespace tags (i.e.
+ * the network namespace of the task which mounted this sysfs
+ * instance).
+ */
+struct sysfs_super_info {
+ const void *ns[KOBJ_NS_TYPES];
+};
+#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
extern struct sysfs_dirent sysfs_root;
extern struct kmem_cache *sysfs_dir_cachep;
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+ const void *ns,
const unsigned char *name);
struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+ const void *ns,
const unsigned char *name);
struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
void sysfs_remove_subdir(struct sysfs_dirent *sd);
int sysfs_rename(struct sysfs_dirent *sd,
- struct sysfs_dirent *new_parent_sd, const char *new_name);
+ struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
int sysfs_inode_init(void);
/*
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 98158de..b86ab8e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
struct timerfd_ctx *ctx = file->private_data;
ssize_t res;
u64 ticks = 0;
- DECLARE_WAITQUEUE(wait, current);
if (count < sizeof(ticks))
return -EINVAL;
spin_lock_irq(&ctx->wqh.lock);
- res = -EAGAIN;
- if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) {
- __add_wait_queue(&ctx->wqh, &wait);
- for (res = 0;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (ctx->ticks) {
- res = 0;
- break;
- }
- if (signal_pending(current)) {
- res = -ERESTARTSYS;
- break;
- }
- spin_unlock_irq(&ctx->wqh.lock);
- schedule();
- spin_lock_irq(&ctx->wqh.lock);
- }
- __remove_wait_queue(&ctx->wqh, &wait);
- __set_current_state(TASK_RUNNING);
- }
+ if (file->f_flags & O_NONBLOCK)
+ res = -EAGAIN;
+ else
+ res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
if (ctx->ticks) {
ticks = ctx->ticks;
if (ctx->expired && ctx->tintv.tv64) {
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 77d5cf4..bcf5a16 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
if (!c->ro_media) {
c->ro_media = 1;
c->no_chk_data_crc = 0;
+ c->vfs_sb->s_flags |= MS_RDONLY;
ubifs_warn("switched to read-only mode, error %d", err);
dbg_dump_stack();
}
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0f8b996..089eaca 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -45,6 +45,15 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+ IO_READ, /* mapping for a read */
+ IO_DELAY, /* mapping covers delalloc region */
+ IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
+ IO_NEW /* just allocated */
+};
/*
* Prime number of hash buckets since address is used as the key.
@@ -103,8 +112,9 @@ xfs_count_page_state(
STATIC struct block_device *
xfs_find_bdev_for_inode(
- struct xfs_inode *ip)
+ struct inode *inode)
{
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
if (XFS_IS_REALTIME_INODE(ip))
@@ -183,7 +193,7 @@ xfs_setfilesize(
xfs_fsize_t isize;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
- ASSERT(ioend->io_type != IOMAP_READ);
+ ASSERT(ioend->io_type != IO_READ);
if (unlikely(ioend->io_error))
return 0;
@@ -214,7 +224,7 @@ xfs_finish_ioend(
if (atomic_dec_and_test(&ioend->io_remaining)) {
struct workqueue_struct *wq;
- wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
+ wq = (ioend->io_type == IO_UNWRITTEN) ?
xfsconvertd_workqueue : xfsdatad_workqueue;
queue_work(wq, &ioend->io_work);
if (wait)
@@ -237,7 +247,7 @@ xfs_end_io(
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
*/
- if (ioend->io_type == IOMAP_UNWRITTEN &&
+ if (ioend->io_type == IO_UNWRITTEN &&
likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -250,7 +260,7 @@ xfs_end_io(
* We might have to update the on-disk file size after extending
* writes.
*/
- if (ioend->io_type != IOMAP_READ) {
+ if (ioend->io_type != IO_READ) {
error = xfs_setfilesize(ioend);
ASSERT(!error || error == EAGAIN);
}
@@ -309,21 +319,25 @@ xfs_map_blocks(
struct inode *inode,
loff_t offset,
ssize_t count,
- xfs_iomap_t *mapp,
+ struct xfs_bmbt_irec *imap,
int flags)
{
int nmaps = 1;
+ int new = 0;
- return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
+ return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
}
STATIC int
-xfs_iomap_valid(
- xfs_iomap_t *iomapp,
- loff_t offset)
+xfs_imap_valid(
+ struct inode *inode,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
{
- return offset >= iomapp->iomap_offset &&
- offset < iomapp->iomap_offset + iomapp->iomap_bsize;
+ offset >>= inode->i_blkbits;
+
+ return offset >= imap->br_startoff &&
+ offset < imap->br_startoff + imap->br_blockcount;
}
/*
@@ -554,19 +568,23 @@ xfs_add_to_ioend(
STATIC void
xfs_map_buffer(
+ struct inode *inode,
struct buffer_head *bh,
- xfs_iomap_t *mp,
- xfs_off_t offset,
- uint block_bits)
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
{
sector_t bn;
+ struct xfs_mount *m = XFS_I(inode)->i_mount;
+ xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+ xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
- ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL);
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
- bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) +
- ((offset - mp->iomap_offset) >> block_bits);
+ bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+ ((offset - iomap_offset) >> inode->i_blkbits);
- ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
+ ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
bh->b_blocknr = bn;
set_buffer_mapped(bh);
@@ -574,17 +592,17 @@ xfs_map_buffer(
STATIC void
xfs_map_at_offset(
+ struct inode *inode,
struct buffer_head *bh,
- loff_t offset,
- int block_bits,
- xfs_iomap_t *iomapp)
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
{
- ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
- ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
lock_buffer(bh);
- xfs_map_buffer(bh, iomapp, offset, block_bits);
- bh->b_bdev = iomapp->iomap_target->bt_bdev;
+ xfs_map_buffer(inode, bh, imap, offset);
+ bh->b_bdev = xfs_find_bdev_for_inode(inode);
set_buffer_mapped(bh);
clear_buffer_delay(bh);
clear_buffer_unwritten(bh);
@@ -713,11 +731,11 @@ xfs_is_delayed_page(
bh = head = page_buffers(page);
do {
if (buffer_unwritten(bh))
- acceptable = (type == IOMAP_UNWRITTEN);
+ acceptable = (type == IO_UNWRITTEN);
else if (buffer_delay(bh))
- acceptable = (type == IOMAP_DELAY);
+ acceptable = (type == IO_DELAY);
else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable = (type == IOMAP_NEW);
+ acceptable = (type == IO_NEW);
else
break;
} while ((bh = bh->b_this_page) != head);
@@ -740,7 +758,7 @@ xfs_convert_page(
struct inode *inode,
struct page *page,
loff_t tindex,
- xfs_iomap_t *mp,
+ struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
int startio,
@@ -750,7 +768,6 @@ xfs_convert_page(
xfs_off_t end_offset;
unsigned long p_offset;
unsigned int type;
- int bbits = inode->i_blkbits;
int len, page_dirty;
int count = 0, done = 0, uptodate = 1;
xfs_off_t offset = page_offset(page);
@@ -802,19 +819,19 @@ xfs_convert_page(
if (buffer_unwritten(bh) || buffer_delay(bh)) {
if (buffer_unwritten(bh))
- type = IOMAP_UNWRITTEN;
+ type = IO_UNWRITTEN;
else
- type = IOMAP_DELAY;
+ type = IO_DELAY;
- if (!xfs_iomap_valid(mp, offset)) {
+ if (!xfs_imap_valid(inode, imap, offset)) {
done = 1;
continue;
}
- ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
- ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
- xfs_map_at_offset(bh, offset, bbits, mp);
+ xfs_map_at_offset(inode, bh, imap, offset);
if (startio) {
xfs_add_to_ioend(inode, bh, offset,
type, ioendp, done);
@@ -826,7 +843,7 @@ xfs_convert_page(
page_dirty--;
count++;
} else {
- type = IOMAP_NEW;
+ type = IO_NEW;
if (buffer_mapped(bh) && all_bh && startio) {
lock_buffer(bh);
xfs_add_to_ioend(inode, bh, offset,
@@ -866,7 +883,7 @@ STATIC void
xfs_cluster_write(
struct inode *inode,
pgoff_t tindex,
- xfs_iomap_t *iomapp,
+ struct xfs_bmbt_irec *imap,
xfs_ioend_t **ioendp,
struct writeback_control *wbc,
int startio,
@@ -885,7 +902,7 @@ xfs_cluster_write(
for (i = 0; i < pagevec_count(&pvec); i++) {
done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- iomapp, ioendp, wbc, startio, all_bh);
+ imap, ioendp, wbc, startio, all_bh);
if (done)
break;
}
@@ -930,7 +947,7 @@ xfs_aops_discard_page(
loff_t offset = page_offset(page);
ssize_t len = 1 << inode->i_blkbits;
- if (!xfs_is_delayed_page(page, IOMAP_DELAY))
+ if (!xfs_is_delayed_page(page, IO_DELAY))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1042,15 +1059,15 @@ xfs_page_state_convert(
int unmapped) /* also implies page uptodate */
{
struct buffer_head *bh, *head;
- xfs_iomap_t iomap;
+ struct xfs_bmbt_irec imap;
xfs_ioend_t *ioend = NULL, *iohead = NULL;
loff_t offset;
unsigned long p_offset = 0;
unsigned int type;
__uint64_t end_offset;
- pgoff_t end_index, last_index, tlast;
+ pgoff_t end_index, last_index;
ssize_t size, len;
- int flags, err, iomap_valid = 0, uptodate = 1;
+ int flags, err, imap_valid = 0, uptodate = 1;
int page_dirty, count = 0;
int trylock = 0;
int all_bh = unmapped;
@@ -1097,7 +1114,7 @@ xfs_page_state_convert(
bh = head = page_buffers(page);
offset = page_offset(page);
flags = BMAPI_READ;
- type = IOMAP_NEW;
+ type = IO_NEW;
/* TODO: cleanup count and page_dirty */
@@ -1111,12 +1128,12 @@ xfs_page_state_convert(
* the iomap is actually still valid, but the ioend
* isn't. shouldn't happen too often.
*/
- iomap_valid = 0;
+ imap_valid = 0;
continue;
}
- if (iomap_valid)
- iomap_valid = xfs_iomap_valid(&iomap, offset);
+ if (imap_valid)
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
/*
* First case, map an unwritten extent and prepare for
@@ -1137,20 +1154,20 @@ xfs_page_state_convert(
* Make sure we don't use a read-only iomap
*/
if (flags == BMAPI_READ)
- iomap_valid = 0;
+ imap_valid = 0;
if (buffer_unwritten(bh)) {
- type = IOMAP_UNWRITTEN;
+ type = IO_UNWRITTEN;
flags = BMAPI_WRITE | BMAPI_IGNSTATE;
} else if (buffer_delay(bh)) {
- type = IOMAP_DELAY;
+ type = IO_DELAY;
flags = BMAPI_ALLOCATE | trylock;
} else {
- type = IOMAP_NEW;
+ type = IO_NEW;
flags = BMAPI_WRITE | BMAPI_MMAP;
}
- if (!iomap_valid) {
+ if (!imap_valid) {
/*
* if we didn't have a valid mapping then we
* need to ensure that we put the new mapping
@@ -1160,7 +1177,7 @@ xfs_page_state_convert(
* for unwritten extent conversion.
*/
new_ioend = 1;
- if (type == IOMAP_NEW) {
+ if (type == IO_NEW) {
size = xfs_probe_cluster(inode,
page, bh, head, 0);
} else {
@@ -1168,14 +1185,14 @@ xfs_page_state_convert(
}
err = xfs_map_blocks(inode, offset, size,
- &iomap, flags);
+ &imap, flags);
if (err)
goto error;
- iomap_valid = xfs_iomap_valid(&iomap, offset);
+ imap_valid = xfs_imap_valid(inode, &imap,
+ offset);
}
- if (iomap_valid) {
- xfs_map_at_offset(bh, offset,
- inode->i_blkbits, &iomap);
+ if (imap_valid) {
+ xfs_map_at_offset(inode, bh, &imap, offset);
if (startio) {
xfs_add_to_ioend(inode, bh, offset,
type, &ioend,
@@ -1194,40 +1211,41 @@ xfs_page_state_convert(
* That means it must already have extents allocated
* underneath it. Map the extent by reading it.
*/
- if (!iomap_valid || flags != BMAPI_READ) {
+ if (!imap_valid || flags != BMAPI_READ) {
flags = BMAPI_READ;
size = xfs_probe_cluster(inode, page, bh,
head, 1);
err = xfs_map_blocks(inode, offset, size,
- &iomap, flags);
+ &imap, flags);
if (err)
goto error;
- iomap_valid = xfs_iomap_valid(&iomap, offset);
+ imap_valid = xfs_imap_valid(inode, &imap,
+ offset);
}
/*
- * We set the type to IOMAP_NEW in case we are doing a
+ * We set the type to IO_NEW in case we are doing a
* small write at EOF that is extending the file but
* without needing an allocation. We need to update the
* file size on I/O completion in this case so it is
* the same case as having just allocated a new extent
* that we are writing into for the first time.
*/
- type = IOMAP_NEW;
+ type = IO_NEW;
if (trylock_buffer(bh)) {
ASSERT(buffer_mapped(bh));
- if (iomap_valid)
+ if (imap_valid)
all_bh = 1;
xfs_add_to_ioend(inode, bh, offset, type,
- &ioend, !iomap_valid);
+ &ioend, !imap_valid);
page_dirty--;
count++;
} else {
- iomap_valid = 0;
+ imap_valid = 0;
}
} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
(unmapped || startio)) {
- iomap_valid = 0;
+ imap_valid = 0;
}
if (!iohead)
@@ -1241,12 +1259,23 @@ xfs_page_state_convert(
if (startio)
xfs_start_page_writeback(page, 1, count);
- if (ioend && iomap_valid) {
- offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
- PAGE_CACHE_SHIFT;
- tlast = min_t(pgoff_t, offset, last_index);
- xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
- wbc, startio, all_bh, tlast);
+ if (ioend && imap_valid) {
+ xfs_off_t end_index;
+
+ end_index = imap.br_startoff + imap.br_blockcount;
+
+ /* to bytes */
+ end_index <<= inode->i_blkbits;
+
+ /* to pages */
+ end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+ /* check against file size */
+ if (end_index > last_index)
+ end_index = last_index;
+
+ xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+ wbc, startio, all_bh, end_index);
}
if (iohead)
@@ -1448,10 +1477,11 @@ __xfs_get_blocks(
int direct,
bmapi_flags_t flags)
{
- xfs_iomap_t iomap;
+ struct xfs_bmbt_irec imap;
xfs_off_t offset;
ssize_t size;
- int niomap = 1;
+ int nimap = 1;
+ int new = 0;
int error;
offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1462,22 +1492,21 @@ __xfs_get_blocks(
return 0;
error = xfs_iomap(XFS_I(inode), offset, size,
- create ? flags : BMAPI_READ, &iomap, &niomap);
+ create ? flags : BMAPI_READ, &imap, &nimap, &new);
if (error)
return -error;
- if (niomap == 0)
+ if (nimap == 0)
return 0;
- if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ imap.br_startblock != DELAYSTARTBLOCK) {
/*
* For unwritten extents do not report a disk address on
* the read case (treat as if we're reading into a hole).
*/
- if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
- xfs_map_buffer(bh_result, &iomap, offset,
- inode->i_blkbits);
- }
- if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
+ if (create || !ISUNWRITTEN(&imap))
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (create && ISUNWRITTEN(&imap)) {
if (direct)
bh_result->b_private = inode;
set_buffer_unwritten(bh_result);
@@ -1488,7 +1517,7 @@ __xfs_get_blocks(
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
- bh_result->b_bdev = iomap.iomap_target->bt_bdev;
+ bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
/*
* If we previously allocated a block out beyond eof and we are now
@@ -1502,10 +1531,10 @@ __xfs_get_blocks(
if (create &&
((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
(offset >= i_size_read(inode)) ||
- (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
+ (new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
- if (iomap.iomap_flags & IOMAP_DELAY) {
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
BUG_ON(direct);
if (create) {
set_buffer_uptodate(bh_result);
@@ -1514,11 +1543,23 @@ __xfs_get_blocks(
}
}
+ /*
+ * If this is O_DIRECT or the mpage code calling tell them how large
+ * the mapping is, so that we can avoid repeated get_blocks calls.
+ */
if (direct || size > (1 << inode->i_blkbits)) {
- ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
- offset = min_t(xfs_off_t,
- iomap.iomap_bsize - iomap.iomap_delta, size);
- bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
+ xfs_off_t mapping_size;
+
+ mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+ mapping_size <<= inode->i_blkbits;
+
+ ASSERT(mapping_size > 0);
+ if (mapping_size > size)
+ mapping_size = size;
+ if (mapping_size > LONG_MAX)
+ mapping_size = LONG_MAX;
+
+ bh_result->b_size = mapping_size;
}
return 0;
@@ -1576,7 +1617,7 @@ xfs_end_io_direct(
*/
ioend->io_offset = offset;
ioend->io_size = size;
- if (ioend->io_type == IOMAP_READ) {
+ if (ioend->io_type == IO_READ) {
xfs_finish_ioend(ioend, 0);
} else if (private && size > 0) {
xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1587,7 +1628,7 @@ xfs_end_io_direct(
* didn't map an unwritten extent so switch it's completion
* handler.
*/
- ioend->io_type = IOMAP_NEW;
+ ioend->io_type = IO_NEW;
xfs_finish_ioend(ioend, 0);
}
@@ -1612,10 +1653,10 @@ xfs_vm_direct_IO(
struct block_device *bdev;
ssize_t ret;
- bdev = xfs_find_bdev_for_inode(XFS_I(inode));
+ bdev = xfs_find_bdev_for_inode(inode);
iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
- IOMAP_UNWRITTEN : IOMAP_READ);
+ IO_UNWRITTEN : IO_READ);
ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44c2b0e..f01de3c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1007,25 +1007,20 @@ xfs_bwrite(
struct xfs_mount *mp,
struct xfs_buf *bp)
{
- int iowait = (bp->b_flags & XBF_ASYNC) == 0;
- int error = 0;
+ int error;
bp->b_strat = xfs_bdstrat_cb;
bp->b_mount = mp;
bp->b_flags |= XBF_WRITE;
- if (!iowait)
- bp->b_flags |= _XBF_RUN_QUEUES;
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
xfs_buf_delwri_dequeue(bp);
xfs_buf_iostrategy(bp);
- if (iowait) {
- error = xfs_buf_iowait(bp);
- if (error)
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- xfs_buf_relse(bp);
- }
-
+ error = xfs_buf_iowait(bp);
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_buf_relse(bp);
return error;
}
@@ -1614,7 +1609,8 @@ xfs_mapping_buftarg(
STATIC int
xfs_alloc_delwrite_queue(
- xfs_buftarg_t *btp)
+ xfs_buftarg_t *btp,
+ const char *fsname)
{
int error = 0;
@@ -1622,7 +1618,7 @@ xfs_alloc_delwrite_queue(
INIT_LIST_HEAD(&btp->bt_delwrite_queue);
spin_lock_init(&btp->bt_delwrite_lock);
btp->bt_flags = 0;
- btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
+ btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
if (IS_ERR(btp->bt_task)) {
error = PTR_ERR(btp->bt_task);
goto out_error;
@@ -1635,7 +1631,8 @@ out_error:
xfs_buftarg_t *
xfs_alloc_buftarg(
struct block_device *bdev,
- int external)
+ int external,
+ const char *fsname)
{
xfs_buftarg_t *btp;
@@ -1647,7 +1644,7 @@ xfs_alloc_buftarg(
goto error;
if (xfs_mapping_buftarg(btp, bdev))
goto error;
- if (xfs_alloc_delwrite_queue(btp))
+ if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
xfs_alloc_bufhash(btp, external);
return btp;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e736..5fbecef 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
/*
* Handling of buftargs.
*/
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bc..d8fb1b5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -115,6 +115,8 @@ xfs_file_fsync(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ xfs_ioend_wait(ip);
+
/*
* We always need to make sure that the required inode state is safe on
* disk. The inode might be clean but we still might need to force the
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 7b26cc2..699b60c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -527,6 +527,10 @@ xfs_attrmulti_by_handle(
if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
return -XFS_ERROR(EFAULT);
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+ return -E2BIG;
+
dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 593c05b..9287135 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
return -XFS_ERROR(EFAULT);
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+ return -E2BIG;
+
dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e65a793..9c8019c 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,7 +673,10 @@ xfs_vn_fiemap(
bm.bmv_length = BTOBB(length);
/* We add one because in getbmap world count includes the header */
- bm.bmv_count = fieinfo->fi_extents_max + 1;
+ bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
+ fieinfo->fi_extents_max + 1;
+ bm.bmv_count = min_t(__s32, bm.bmv_count,
+ (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
bm.bmv_iflags = BMV_IF_PREALLOC;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
bm.bmv_iflags |= BMV_IF_ATTRFORK;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 29f1edc..e900251 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -789,18 +789,18 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
+ mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1);
+ mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -902,7 +902,8 @@ xfsaild_start(
struct xfs_ail *ailp)
{
ailp->xa_target = 0;
- ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
+ ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+ ailp->xa_mount->m_fsname);
if (IS_ERR(ailp->xa_task))
return -PTR_ERR(ailp->xa_task);
return 0;
@@ -1092,6 +1093,7 @@ xfs_fs_write_inode(
* the code will only flush the inode if it isn't already
* being flushed.
*/
+ xfs_ioend_wait(ip);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (ip->i_update_core) {
error = xfs_log_inode(ip);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a427c63..3884e20 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -356,68 +356,23 @@ xfs_commit_dummy_trans(
STATIC int
xfs_sync_fsdata(
- struct xfs_mount *mp,
- int flags)
+ struct xfs_mount *mp)
{
struct xfs_buf *bp;
- struct xfs_buf_log_item *bip;
- int error = 0;
/*
- * If this is xfssyncd() then only sync the superblock if we can
- * lock it without sleeping and it is not pinned.
+ * If the buffer is pinned then push on the log so we won't get stuck
+ * waiting in the write for someone, maybe ourselves, to flush the log.
+ *
+ * Even though we just pushed the log above, we did not have the
+ * superblock buffer locked at that point so it can become pinned in
+ * between there and here.
*/
- if (flags & SYNC_TRYLOCK) {
- ASSERT(!(flags & SYNC_WAIT));
-
- bp = xfs_getsb(mp, XBF_TRYLOCK);
- if (!bp)
- goto out;
-
- bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
- if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
- goto out_brelse;
- } else {
- bp = xfs_getsb(mp, 0);
-
- /*
- * If the buffer is pinned then push on the log so we won't
- * get stuck waiting in the write for someone, maybe
- * ourselves, to flush the log.
- *
- * Even though we just pushed the log above, we did not have
- * the superblock buffer locked at that point so it can
- * become pinned in between there and here.
- */
- if (XFS_BUF_ISPINNED(bp))
- xfs_log_force(mp, 0);
- }
-
-
- if (flags & SYNC_WAIT)
- XFS_BUF_UNASYNC(bp);
- else
- XFS_BUF_ASYNC(bp);
-
- error = xfs_bwrite(mp, bp);
- if (error)
- return error;
-
- /*
- * If this is a data integrity sync make sure all pending buffers
- * are flushed out for the log coverage check below.
- */
- if (flags & SYNC_WAIT)
- xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
- if (xfs_log_need_covered(mp))
- error = xfs_commit_dummy_trans(mp, flags);
- return error;
+ bp = xfs_getsb(mp, 0);
+ if (XFS_BUF_ISPINNED(bp))
+ xfs_log_force(mp, 0);
- out_brelse:
- xfs_buf_relse(bp);
- out:
- return error;
+ return xfs_bwrite(mp, bp);
}
/*
@@ -441,7 +396,7 @@ int
xfs_quiesce_data(
struct xfs_mount *mp)
{
- int error;
+ int error, error2 = 0;
/* push non-blocking */
xfs_sync_data(mp, 0);
@@ -452,13 +407,20 @@ xfs_quiesce_data(
xfs_qm_sync(mp, SYNC_WAIT);
/* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp, SYNC_WAIT);
+ error = xfs_sync_fsdata(mp);
+
+ /* make sure all delwri buffers are written out */
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+ /* mark the log as covered if needed */
+ if (xfs_log_need_covered(mp))
+ error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
/* flush data-only devices */
if (mp->m_rtdev_targp)
XFS_bflush(mp->m_rtdev_targp);
- return error;
+ return error ? error : error2;
}
STATIC void
@@ -581,9 +543,9 @@ xfs_flush_inodes(
}
/*
- * Every sync period we need to unpin all items, reclaim inodes, sync
- * quota and write out the superblock. We might need to cover the log
- * to indicate it is idle.
+ * Every sync period we need to unpin all items, reclaim inodes and sync
+ * disk quotas. We might need to cover the log to indicate that the
+ * filesystem is idle.
*/
STATIC void
xfs_sync_worker(
@@ -597,7 +559,8 @@ xfs_sync_worker(
xfs_reclaim_inodes(mp, 0);
/* dgc: errors ignored here */
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
- error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
+ if (xfs_log_need_covered(mp))
+ error = xfs_commit_dummy_trans(mp, 0);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
@@ -660,7 +623,7 @@ xfs_syncd_init(
mp->m_sync_work.w_syncer = xfs_sync_worker;
mp->m_sync_work.w_mount = mp;
mp->m_sync_work.w_completion = NULL;
- mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+ mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
if (IS_ERR(mp->m_sync_task))
return -PTR_ERR(mp->m_sync_task);
return 0;
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a10760..207fa77 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
-#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
#include "xfs_aops.h"
#include "quota/xfs_dquot_item.h"
#include "quota/xfs_dquot.h"
+#include "xfs_log_recover.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f..8a319cf 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
struct xfs_dquot;
struct xlog_ticket;
struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(int, count)
+ __field(int, pincount)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->pincount = atomic_read(&ip->i_pincount);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->count,
+ __entry->pincount,
(char *)__entry->caller_ip)
)
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
TP_ARGS(ip, caller_ip))
DEFINE_INODE_EVENT(xfs_ihold);
DEFINE_INODE_EVENT(xfs_irele);
+DEFINE_INODE_EVENT(xfs_inode_pin);
+DEFINE_INODE_EVENT(xfs_inode_unpin);
+DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
+
/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
DEFINE_INODE_EVENT(xfs_inode);
#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
TP_PROTO(struct xfs_dquot *dqp), \
TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqadjust);
-DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1495,6 +1503,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+ TP_PROTO(struct log *log, struct xlog_recover *trans,
+ struct xlog_recover_item *item, int pass),
+ TP_ARGS(log, trans, item, pass),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, item)
+ __field(xlog_tid_t, tid)
+ __field(int, type)
+ __field(int, pass)
+ __field(int, count)
+ __field(int, total)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->item = (unsigned long)item;
+ __entry->tid = trans->r_log_tid;
+ __entry->type = ITEM_TYPE(item);
+ __entry->pass = pass;
+ __entry->count = item->ri_cnt;
+ __entry->total = item->ri_total;
+ ),
+ TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+ "item region count/total %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->pass,
+ (void *)__entry->item,
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __entry->count,
+ __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+ TP_PROTO(struct log *log, struct xlog_recover *trans, \
+ struct xlog_recover_item *item, int pass), \
+ TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+ TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+ TP_ARGS(log, buf_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__int64_t, blkno)
+ __field(unsigned short, len)
+ __field(unsigned short, flags)
+ __field(unsigned short, size)
+ __field(unsigned int, map_size)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->blkno = buf_f->blf_blkno;
+ __entry->len = buf_f->blf_len;
+ __entry->flags = buf_f->blf_flags;
+ __entry->size = buf_f->blf_size;
+ __entry->map_size = buf_f->blf_map_size;
+ ),
+ TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+ "map_size %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->blkno,
+ __entry->len,
+ __entry->flags,
+ __entry->size,
+ __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+ TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+ TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+ TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned short, size)
+ __field(int, fields)
+ __field(unsigned short, asize)
+ __field(unsigned short, dsize)
+ __field(__int64_t, blkno)
+ __field(int, len)
+ __field(int, boffset)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->ino = in_f->ilf_ino;
+ __entry->size = in_f->ilf_size;
+ __entry->fields = in_f->ilf_fields;
+ __entry->asize = in_f->ilf_asize;
+ __entry->dsize = in_f->ilf_dsize;
+ __entry->blkno = in_f->ilf_blkno;
+ __entry->len = in_f->ilf_len;
+ __entry->boffset = in_f->ilf_boffset;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+ "dsize %d, blkno 0x%llx, len %d, boffset %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->fields,
+ __entry->asize,
+ __entry->dsize,
+ __entry->blkno,
+ __entry->len,
+ __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+ TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd7..b89ec5d 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
* No need to re-initialize these if this is a reclaimed dquot.
*/
if (brandnewdquot) {
- dqp->dq_flnext = dqp->dq_flprev = dqp;
+ INIT_LIST_HEAD(&dqp->q_freelist);
mutex_init(&dqp->q_qlock);
init_waitqueue_head(&dqp->q_pinwait);
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
* Only the q_core portion was zeroed in dqreclaim_one().
* So, we need to reset others.
*/
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
- dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- atomic_set(&dqp->q_pincount, 0);
- dqp->q_hash = NULL;
- ASSERT(dqp->dq_flnext == dqp->dq_flprev);
+ dqp->q_nrefs = 0;
+ dqp->q_blkno = 0;
+ INIT_LIST_HEAD(&dqp->q_mplist);
+ INIT_LIST_HEAD(&dqp->q_hashlist);
+ dqp->q_bufoffset = 0;
+ dqp->q_fileoffset = 0;
+ dqp->q_transp = NULL;
+ dqp->q_gdquot = NULL;
+ dqp->q_res_bcount = 0;
+ dqp->q_res_icount = 0;
+ dqp->q_res_rtbcount = 0;
+ atomic_set(&dqp->q_pincount, 0);
+ dqp->q_hash = NULL;
+ ASSERT(list_empty(&dqp->q_freelist));
trace_xfs_dqreuse(dqp);
}
@@ -158,7 +158,7 @@ void
xfs_qm_dqdestroy(
xfs_dquot_t *dqp)
{
- ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
+ ASSERT(list_empty(&dqp->q_freelist));
mutex_destroy(&dqp->q_qlock);
sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
(be64_to_cpu(d->d_bcount) >=
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = cpu_to_be32(get_seconds() +
- XFS_QI_BTIMELIMIT(mp));
+ mp->m_quotainfo->qi_btimelimit);
} else {
d->d_bwarns = 0;
}
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
(be64_to_cpu(d->d_icount) >=
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = cpu_to_be32(get_seconds() +
- XFS_QI_ITIMELIMIT(mp));
+ mp->m_quotainfo->qi_itimelimit);
} else {
d->d_iwarns = 0;
}
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
(be64_to_cpu(d->d_rtbcount) >=
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = cpu_to_be32(get_seconds() +
- XFS_QI_RTBTIMELIMIT(mp));
+ mp->m_quotainfo->qi_rtbtimelimit);
} else {
d->d_rtbwarns = 0;
}
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
uint type,
xfs_buf_t *bp)
{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
xfs_dqblk_t *d;
int curid, i;
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
/*
* ID of the first dquot in the block - id's are zero based.
*/
- curid = id - (id % XFS_QM_DQPERBLK(mp));
+ curid = id - (id % q->qi_dqperchunk);
ASSERT(curid >= 0);
- memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
- for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
+ memset(d, 0, BBTOB(q->qi_dqchunklen));
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
xfs_qm_dqinit_core(curid, type, d);
xfs_trans_dquot_buf(tp, bp,
(type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
XFS_BLI_GDQUOT_BUF)));
- xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
/* now we can just get the buffer (there's nothing to read yet) */
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(mp),
+ mp->m_quotainfo->qi_dqchunklen,
0);
if (!bp || (error = XFS_BUF_GETERROR(bp)))
goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
*/
if (dqp->q_blkno == (xfs_daddr_t) 0) {
/* We use the id as an index */
- dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp);
+ dqp->q_fileoffset = (xfs_fileoff_t)id /
+ mp->m_quotainfo->qi_dqperchunk;
nmaps = 1;
quotip = XFS_DQ_TO_QIP(dqp);
xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
/*
* offset of dquot in the (fixed sized) dquot chunk.
*/
- dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
sizeof(xfs_dqblk_t);
if (map.br_startblock == HOLESTARTBLOCK) {
/*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
* Read in the buffer, unless we've just done the allocation
* (in which case we already have the buf).
*/
- if (! newdquot) {
+ if (!newdquot) {
trace_xfs_dqtobp_read(dqp);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(mp),
- 0, &bp))) {
- return (error);
- }
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0, &bp);
if (error || !bp)
return XFS_ERROR(error);
}
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
tp = NULL;
if (flags & XFS_QMOPT_DQALLOC) {
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- if ((error = xfs_trans_reserve(tp,
- XFS_QM_DQALLOC_SPACE_RES(mp),
- XFS_WRITE_LOG_RES(mp) +
- BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
- 128,
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT))) {
+ error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ XFS_WRITE_LOG_RES(mp) +
+ BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+ 128,
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error) {
cancelflags = 0;
goto error0;
}
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
{
xfs_dquot_t *dqp;
uint flist_locked;
- xfs_dquot_t *d;
ASSERT(mutex_is_locked(&qh->qh_lock));
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
/*
* Traverse the hashchain looking for a match
*/
- for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+ list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
/*
* We already have the hashlock. We don't need the
* dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
/*
* All in core dquots must be on the dqlist of mp
*/
- ASSERT(dqp->MPL_PREVP != NULL);
+ ASSERT(!list_empty(&dqp->q_mplist));
xfs_dqlock(dqp);
if (dqp->q_nrefs == 0) {
- ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
- if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+ ASSERT(!list_empty(&dqp->q_freelist));
+ if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
trace_xfs_dqlookup_want(dqp);
/*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
*/
dqp->dq_flags |= XFS_DQ_WANT;
xfs_dqunlock(dqp);
- xfs_qm_freelist_lock(xfs_Gqm);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
xfs_dqlock(dqp);
dqp->dq_flags &= ~(XFS_DQ_WANT);
}
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
if (flist_locked) {
if (dqp->q_nrefs != 0) {
- xfs_qm_freelist_unlock(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
flist_locked = B_FALSE;
} else {
- /*
- * take it off the freelist
- */
+ /* take it off the freelist */
trace_xfs_dqlookup_freelist(dqp);
- XQM_FREELIST_REMOVE(dqp);
- /* xfs_qm_freelist_print(&(xfs_Gqm->
- qm_dqfreelist),
- "after removal"); */
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
}
}
- /*
- * grab a reference
- */
XFS_DQHOLD(dqp);
if (flist_locked)
- xfs_qm_freelist_unlock(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
/*
* move the dquot to the front of the hashchain
*/
ASSERT(mutex_is_locked(&qh->qh_lock));
- if (dqp->HL_PREVP != &qh->qh_next) {
- trace_xfs_dqlookup_move(dqp);
- if ((d = dqp->HL_NEXT))
- d->HL_PREVP = dqp->HL_PREVP;
- *(dqp->HL_PREVP) = d;
- d = qh->qh_next;
- d->HL_PREVP = &dqp->HL_NEXT;
- dqp->HL_NEXT = d;
- dqp->HL_PREVP = &qh->qh_next;
- qh->qh_next = dqp;
- }
+ list_move(&dqp->q_hashlist, &qh->qh_list);
trace_xfs_dqlookup_done(dqp);
*O_dqpp = dqp;
- ASSERT(mutex_is_locked(&qh->qh_lock));
- return (0);
+ return 0;
}
}
@@ -975,16 +956,17 @@ xfs_qm_dqget(
*/
if (ip) {
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (! XFS_IS_DQTYPE_ON(mp, type)) {
- /* inode stays locked on return */
- xfs_qm_dqdestroy(dqp);
- return XFS_ERROR(ESRCH);
- }
+
/*
* A dquot could be attached to this inode by now, since
* we had dropped the ilock.
*/
if (type == XFS_DQ_USER) {
+ if (!XFS_IS_UQUOTA_ON(mp)) {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
+ }
if (ip->i_udquot) {
xfs_qm_dqdestroy(dqp);
dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
goto dqret;
}
} else {
+ if (!XFS_IS_OQUOTA_ON(mp)) {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
+ }
if (ip->i_gdquot) {
xfs_qm_dqdestroy(dqp);
dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
*/
ASSERT(mutex_is_locked(&h->qh_lock));
dqp->q_hash = h;
- XQM_HASHLIST_INSERT(h, dqp);
+ list_add(&dqp->q_hashlist, &h->qh_list);
+ h->qh_version++;
/*
* Attach this dquot to this filesystem's list of all dquots,
* kept inside the mount structure in m_quotainfo field
*/
- xfs_qm_mplist_lock(mp);
+ mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
/*
* We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
xfs_dqlock(dqp);
dqp->q_nrefs = 1;
- XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
- xfs_qm_mplist_unlock(mp);
+ list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+ mp->m_quotainfo->qi_dquots++;
+ mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
mutex_unlock(&h->qh_lock);
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
* drop the dqlock and acquire the freelist and dqlock
* in the right order; but try to get it out-of-order first
*/
- if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+ if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
trace_xfs_dqput_wait(dqp);
xfs_dqunlock(dqp);
- xfs_qm_freelist_lock(xfs_Gqm);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
xfs_dqlock(dqp);
}
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
if (--dqp->q_nrefs == 0) {
trace_xfs_dqput_free(dqp);
- /*
- * insert at end of the freelist.
- */
- XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
+ list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+ xfs_Gqm->qm_dqfrlist_cnt++;
/*
* If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
xfs_dqlock(gdqp);
dqp->q_gdquot = NULL;
}
-
- /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
- "@@@@@++ Free list (after append) @@@@@+");
- */
}
xfs_dqunlock(dqp);
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
break;
dqp = gdqp;
}
- xfs_qm_freelist_unlock(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
}
/*
@@ -1386,10 +1368,10 @@ int
xfs_qm_dqpurge(
xfs_dquot_t *dqp)
{
- xfs_dqhash_t *thishash;
+ xfs_dqhash_t *qh = dqp->q_hash;
xfs_mount_t *mp = dqp->q_mount;
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+ ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
return (1);
}
- ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+ ASSERT(!list_empty(&dqp->q_freelist));
/*
* If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
!(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
- thishash = dqp->q_hash;
- XQM_HASHLIST_REMOVE(thishash, dqp);
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+ list_del_init(&dqp->q_hashlist);
+ qh->qh_version++;
+ list_del_init(&dqp->q_mplist);
+ mp->m_quotainfo->qi_dqreclaims++;
+ mp->m_quotainfo->qi_dquots--;
/*
* XXX Move this to the front of the freelist, if we can get the
* freelist lock.
*/
- ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+ ASSERT(!list_empty(&dqp->q_freelist));
dqp->q_mount = NULL;
dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
memset(&dqp->q_core, 0, sizeof(dqp->q_core));
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- mutex_unlock(&thishash->qh_lock);
+ mutex_unlock(&qh->qh_lock);
return (0);
}
@@ -1517,6 +1501,7 @@ void
xfs_qm_dqflock_pushbuf_wait(
xfs_dquot_t *dqp)
{
+ xfs_mount_t *mp = dqp->q_mount;
xfs_buf_t *bp;
/*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
* out immediately. We'll be able to acquire
* the flush lock when the I/O completes.
*/
- bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
- XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
+ bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
if (!bp)
goto out_lock;
if (XFS_BUF_ISDELAYWRITE(bp)) {
if (XFS_BUF_ISPINNED(bp))
- xfs_log_force(dqp->q_mount, 0);
+ xfs_log_force(mp, 0);
xfs_buf_delwri_promote(bp);
wake_up_process(bp->b_target->bt_task);
}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da5..5da3a23 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
* The hash chain headers (hash buckets)
*/
typedef struct xfs_dqhash {
- struct xfs_dquot *qh_next;
+ struct list_head qh_list;
struct mutex qh_lock;
uint qh_version; /* ever increasing version */
uint qh_nelems; /* number of dquots on the list */
} xfs_dqhash_t;
-typedef struct xfs_dqlink {
- struct xfs_dquot *ql_next; /* forward link */
- struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
-} xfs_dqlink_t;
-
struct xfs_mount;
struct xfs_trans;
/*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
- struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
- struct xfs_dquot*dqm_flprev;
- xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
- xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
- uint dqm_flags; /* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
* The incore dquot structure
*/
typedef struct xfs_dquot {
- xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */
+ uint dq_flags; /* various flags (XFS_DQ_*) */
+ struct list_head q_freelist; /* global free list of dquots */
+ struct list_head q_mplist; /* mount's list of dquots */
+ struct list_head q_hashlist; /* gloabl hash list of dquots */
xfs_dqhash_t *q_hash; /* the hashchain header */
struct xfs_mount*q_mount; /* filesystem this relates to */
struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
} xfs_dquot_t;
-
-#define dq_flnext q_lists.dqm_flnext
-#define dq_flprev q_lists.dqm_flprev
-#define dq_mplist q_lists.dqm_mplist
-#define dq_hashlist q_lists.dqm_hashlist
-#define dq_flags q_lists.dqm_flags
-
/*
* Lock hierarchy for q_qlock:
* XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
}
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a..8d89a24 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
/* ARGSUSED */
STATIC void
xfs_qm_dquot_logitem_unpin(
- xfs_dq_logitem_t *logitem,
- int stale)
+ xfs_dq_logitem_t *logitem)
{
xfs_dquot_t *dqp = logitem->qli_dquot;
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
xfs_dq_logitem_t *logitem,
xfs_trans_t *tp)
{
- xfs_qm_dquot_logitem_unpin(logitem, 0);
+ xfs_qm_dquot_logitem_unpin(logitem);
}
/*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
}
mp = dqp->q_mount;
bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
- XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
xfs_dqunlock(dqp);
if (!bp)
return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_qm_dquot_logitem_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))
- xfs_qm_dquot_logitem_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
xfs_qm_dquot_logitem_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
xfs_dq_logitem_t *lp;
lp = &dqp->q_logitem;
- lp->qli_item.li_type = XFS_LI_DQUOT;
- lp->qli_item.li_ops = &xfs_dquot_item_ops;
- lp->qli_item.li_mountp = dqp->q_mount;
+ xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+ &xfs_dquot_item_ops);
lp->qli_dquot = dqp;
lp->qli_format.qlf_type = XFS_LI_DQUOT;
lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
*/
/*ARGSUSED*/
STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
{
return;
}
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_qm_qoff_logitem_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t* ,int))
- xfs_qm_qoff_logitem_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
xfs_qm_qoff_logitem_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_qm_qoff_logitem_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))
- xfs_qm_qoff_logitem_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
xfs_qm_qoff_logitem_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
- qf->qql_item.li_type = XFS_LI_QUOTAOFF;
- if (start)
- qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
- else
- qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+ xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+ &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
qf->qql_item.li_mountp = mp;
qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
qf->qql_format.qf_flags = flags;
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e..38e7641 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
-STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
-
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
STATIC int xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
#endif
#ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
- xfs_dquot_t *dqp; int i = 0; \
- cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
- for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
- cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \
- "bcnt = %d, icnt = %d, refs = %d", \
- ++i, (int) be32_to_cpu(dqp->q_core.d_id), \
- DQFLAGTO_TYPESTR(dqp), \
- (int) be64_to_cpu(dqp->q_core.d_bcount), \
- (int) be64_to_cpu(dqp->q_core.d_icount), \
- (int) dqp->q_nrefs); } \
+static void
+xfs_qm_dquot_list_print(
+ struct xfs_mount *mp)
+{
+ xfs_dquot_t *dqp;
+ int i = 0;
+
+ list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
+ cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
+ "bcnt = %lld, icnt = %lld, refs = %d",
+ i++, be32_to_cpu(dqp->q_core.d_id),
+ DQFLAGTO_TYPESTR(dqp),
+ (long long)be64_to_cpu(dqp->q_core.d_bcount),
+ (long long)be64_to_cpu(dqp->q_core.d_icount),
+ dqp->q_nrefs);
+ }
}
#else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
+static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
#endif
/*
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
/*
* Freelist of all dquots of all file systems
*/
- xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+ INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+ xqm->qm_dqfrlist_cnt = 0;
+ mutex_init(&xqm->qm_dqfrlist_lock);
/*
* dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
xfs_qm_destroy(
struct xfs_qm *xqm)
{
+ struct xfs_dquot *dqp, *n;
int hsize, i;
ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
xqm->qm_usr_dqhtable = NULL;
xqm->qm_grp_dqhtable = NULL;
xqm->qm_dqhashmask = 0;
- xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+
+ /* frlist cleanup */
+ mutex_lock(&xqm->qm_dqfrlist_lock);
+ list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+ xfs_dqlock(dqp);
+#ifdef QUOTADEBUG
+ cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
+#endif
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ xfs_dqunlock(dqp);
+ xfs_qm_dqdestroy(dqp);
+ }
+ mutex_unlock(&xqm->qm_dqfrlist_lock);
+ mutex_destroy(&xqm->qm_dqfrlist_lock);
#ifdef DEBUG
mutex_destroy(&qcheck_lock);
#endif
@@ -256,7 +274,7 @@ STATIC void
xfs_qm_rele_quotafs_ref(
struct xfs_mount *mp)
{
- xfs_dquot_t *dqp, *nextdqp;
+ xfs_dquot_t *dqp, *n;
ASSERT(xfs_Gqm);
ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
/*
* Go thru the freelist and destroy all inactive dquots.
*/
- xfs_qm_freelist_lock(xfs_Gqm);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
- dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+ list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
xfs_dqlock(dqp);
- nextdqp = dqp->dq_flnext;
if (dqp->dq_flags & XFS_DQ_INACTIVE) {
ASSERT(dqp->q_mount == NULL);
ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
- XQM_FREELIST_REMOVE(dqp);
+ ASSERT(list_empty(&dqp->q_hashlist));
+ ASSERT(list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
xfs_dqunlock(dqp);
xfs_qm_dqdestroy(dqp);
} else {
xfs_dqunlock(dqp);
}
- dqp = nextdqp;
}
- xfs_qm_freelist_unlock(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
/*
* Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
struct xfs_mount *mp)
{
if (mp->m_quotainfo) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
xfs_qm_destroy_quotainfo(mp);
}
}
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
*/
STATIC int
xfs_qm_dqflush_all(
- xfs_mount_t *mp,
- int sync_mode)
+ struct xfs_mount *mp,
+ int sync_mode)
{
- int recl;
- xfs_dquot_t *dqp;
- int niters;
- int error;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ int recl;
+ struct xfs_dquot *dqp;
+ int niters;
+ int error;
- if (mp->m_quotainfo == NULL)
+ if (!q)
return 0;
niters = 0;
again:
- xfs_qm_mplist_lock(mp);
- FOREACH_DQUOT_IN_MP(dqp, mp) {
+ mutex_lock(&q->qi_dqlist_lock);
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
xfs_dqlock(dqp);
if (! XFS_DQ_IS_DIRTY(dqp)) {
xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
}
/* XXX a sentinel would be better */
- recl = XFS_QI_MPLRECLAIMS(mp);
+ recl = q->qi_dqreclaims;
if (!xfs_dqflock_nowait(dqp)) {
/*
* If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
* Let go of the mplist lock. We don't want to hold it
* across a disk write.
*/
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
error = xfs_qm_dqflush(dqp, sync_mode);
xfs_dqunlock(dqp);
if (error)
return error;
- xfs_qm_mplist_lock(mp);
- if (recl != XFS_QI_MPLRECLAIMS(mp)) {
- xfs_qm_mplist_unlock(mp);
+ mutex_lock(&q->qi_dqlist_lock);
+ if (recl != q->qi_dqreclaims) {
+ mutex_unlock(&q->qi_dqlist_lock);
/* XXX restart limit */
goto again;
}
}
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
/* return ! busy */
return 0;
}
@@ -509,15 +526,15 @@ again:
*/
STATIC void
xfs_qm_detach_gdquots(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_dquot_t *dqp, *gdqp;
- int nrecl;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dquot *dqp, *gdqp;
+ int nrecl;
again:
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
- dqp = XFS_QI_MPLNEXT(mp);
- while (dqp) {
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
xfs_dqlock(dqp);
if ((gdqp = dqp->q_gdquot)) {
xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
* Can't hold the mplist lock across a dqput.
* XXXmust convert to marker based iterations here.
*/
- nrecl = XFS_QI_MPLRECLAIMS(mp);
- xfs_qm_mplist_unlock(mp);
+ nrecl = q->qi_dqreclaims;
+ mutex_unlock(&q->qi_dqlist_lock);
xfs_qm_dqput(gdqp);
- xfs_qm_mplist_lock(mp);
- if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+ mutex_lock(&q->qi_dqlist_lock);
+ if (nrecl != q->qi_dqreclaims)
goto again;
}
- dqp = dqp->MPL_NEXT;
}
}
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
*/
STATIC int
xfs_qm_dqpurge_int(
- xfs_mount_t *mp,
- uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
+ struct xfs_mount *mp,
+ uint flags)
{
- xfs_dquot_t *dqp;
- uint dqtype;
- int nrecl;
- xfs_dquot_t *nextdqp;
- int nmisses;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dquot *dqp, *n;
+ uint dqtype;
+ int nrecl;
+ int nmisses;
- if (mp->m_quotainfo == NULL)
+ if (!q)
return 0;
dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
- xfs_qm_mplist_lock(mp);
+ mutex_lock(&q->qi_dqlist_lock);
/*
* In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
again:
nmisses = 0;
- ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
/*
* Try to get rid of all of the unwanted dquots. The idea is to
* get them off mplist and hashlist, but leave them on freelist.
*/
- dqp = XFS_QI_MPLNEXT(mp);
- while (dqp) {
+ list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
/*
* It's OK to look at the type without taking dqlock here.
* We're holding the mplist lock here, and that's needed for
* a dqreclaim.
*/
- if ((dqp->dq_flags & dqtype) == 0) {
- dqp = dqp->MPL_NEXT;
+ if ((dqp->dq_flags & dqtype) == 0)
continue;
- }
if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- nrecl = XFS_QI_MPLRECLAIMS(mp);
- xfs_qm_mplist_unlock(mp);
+ nrecl = q->qi_dqreclaims;
+ mutex_unlock(&q->qi_dqlist_lock);
mutex_lock(&dqp->q_hash->qh_lock);
- xfs_qm_mplist_lock(mp);
+ mutex_lock(&q->qi_dqlist_lock);
/*
* XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
* No one can be adding dquots to the mplist at
* this point, but somebody might be taking things off.
*/
- if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+ if (nrecl != q->qi_dqreclaims) {
mutex_unlock(&dqp->q_hash->qh_lock);
goto again;
}
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
* Take the dquot off the mplist and hashlist. It may remain on
* freelist in INACTIVE state.
*/
- nextdqp = dqp->MPL_NEXT;
nmisses += xfs_qm_dqpurge(dqp);
- dqp = nextdqp;
}
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
return nmisses;
}
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
int
xfs_qm_sync(
- xfs_mount_t *mp,
- int flags)
+ struct xfs_mount *mp,
+ int flags)
{
- int recl, restarts;
- xfs_dquot_t *dqp;
- int error;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ int recl, restarts;
+ struct xfs_dquot *dqp;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
restarts = 0;
again:
- xfs_qm_mplist_lock(mp);
+ mutex_lock(&q->qi_dqlist_lock);
/*
* dqpurge_all() also takes the mplist lock and iterate thru all dquots
* in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
* when we have the mplist lock, we know that dquots will be consistent
* as long as we have it locked.
*/
- if (! XFS_IS_QUOTA_ON(mp)) {
- xfs_qm_mplist_unlock(mp);
+ if (!XFS_IS_QUOTA_ON(mp)) {
+ mutex_unlock(&q->qi_dqlist_lock);
return 0;
}
- FOREACH_DQUOT_IN_MP(dqp, mp) {
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
/*
* If this is vfs_sync calling, then skip the dquots that
* don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
}
/* XXX a sentinel would be better */
- recl = XFS_QI_MPLRECLAIMS(mp);
+ recl = q->qi_dqreclaims;
if (!xfs_dqflock_nowait(dqp)) {
if (flags & SYNC_TRYLOCK) {
xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
* Let go of the mplist lock. We don't want to hold it
* across a disk write
*/
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
error = xfs_qm_dqflush(dqp, flags);
xfs_dqunlock(dqp);
if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
else if (error)
return error;
- xfs_qm_mplist_lock(mp);
- if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+ mutex_lock(&q->qi_dqlist_lock);
+ if (recl != q->qi_dqreclaims) {
if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
break;
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
goto again;
}
}
- xfs_qm_mplist_unlock(mp);
+ mutex_unlock(&q->qi_dqlist_lock);
return 0;
}
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
return error;
}
- xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
- lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+ INIT_LIST_HEAD(&qinf->qi_dqlist);
+ mutex_init(&qinf->qi_dqlist_lock);
+ lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
qinf->qi_dqreclaims = 0;
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
*/
xfs_qm_rele_quotafs_ref(mp);
- xfs_qm_list_destroy(&qi->qi_dqlist);
+ ASSERT(list_empty(&qi->qi_dqlist));
+ mutex_destroy(&qi->qi_dqlist_lock);
if (qi->qi_uquotaip) {
IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
int n)
{
mutex_init(&list->qh_lock);
- list->qh_next = NULL;
+ INIT_LIST_HEAD(&list->qh_list);
list->qh_version = 0;
list->qh_nelems = 0;
}
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
*/
spin_lock(&mp->m_sb_lock);
if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
/* qflags will get updated _after_ quotacheck */
mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- cmn_err(CE_NOTE,
- "Old superblock version %x, converting to %x.",
- oldv, mp->m_sb.sb_versionnum);
-#endif
}
if (flags & XFS_QMOPT_UQUOTA)
mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
#ifdef DEBUG
j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
do_div(j, sizeof(xfs_dqblk_t));
- ASSERT(XFS_QM_DQPERBLK(mp) == j);
+ ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
- for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+ for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
while (blkcnt--) {
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, bno),
- (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
if (error)
break;
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
* goto the next block.
*/
bno++;
- firstid += XFS_QM_DQPERBLK(mp);
+ firstid += mp->m_quotainfo->qi_dqperchunk;
}
return error;
}
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
continue;
firstid = (xfs_dqid_t) map[i].br_startoff *
- XFS_QM_DQPERBLK(mp);
+ mp->m_quotainfo->qi_dqperchunk;
/*
* Do a read-ahead on the next extent.
*/
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
while (rablkcnt--) {
xfs_baread(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, rablkno),
- (int)XFS_QI_DQCHUNKLEN(mp));
+ mp->m_quotainfo->qi_dqchunklen);
rablkno++;
}
}
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
/*
* Set default limits, adjust timers (since we changed usages)
+ *
+ * There are no timers for the default values set in the root dquot.
*/
- if (! XFS_IS_SUSER_DQUOT(dqp)) {
+ if (dqp->q_core.d_id) {
xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
}
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
lastino = 0;
flags = 0;
- ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+ ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
/*
* There should be no cached dquots. The (simplistic) quotacheck
* algorithm doesn't like that.
*/
- ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+ ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
* their counters to zero. We need a clean slate.
* We don't log our changes till later.
*/
- if ((uip = XFS_QI_UQIP(mp))) {
- if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+ uip = mp->m_quotainfo->qi_uquotaip;
+ if (uip) {
+ error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+ if (error)
goto error_return;
flags |= XFS_UQUOTA_CHKD;
}
- if ((gip = XFS_QI_GQIP(mp))) {
- if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
- XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
+ gip = mp->m_quotainfo->qi_gquotaip;
+ if (gip) {
+ error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+ XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ if (error)
goto error_return;
flags |= XFS_OQUOTA_CHKD;
}
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
* at this point (because we intentionally didn't in dqget_noattach).
*/
if (error) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
+ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
goto error_return;
}
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
mp->m_qflags |= flags;
- XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+ xfs_qm_dquot_list_print(mp);
error_return:
if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
}
}
- XFS_QI_UQIP(mp) = uip;
- XFS_QI_GQIP(mp) = gip;
+ mp->m_quotainfo->qi_uquotaip = uip;
+ mp->m_quotainfo->qi_gquotaip = gip;
return 0;
}
+
/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
*/
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
{
- int nreclaimed;
- xfs_dqhash_t *hash;
- xfs_dquot_t *dqp, *nextdqp;
+ xfs_dquot_t *dqpout;
+ xfs_dquot_t *dqp;
int restarts;
- int nflushes;
-
- if (howmany <= 0)
- return 0;
- nreclaimed = 0;
restarts = 0;
- nflushes = 0;
+ dqpout = NULL;
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
- /* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
- xfs_qm_freelist_lock(xfs_Gqm);
+ /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+startagain:
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
- for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
- ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
- nreclaimed < howmany); ) {
+ list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+ struct xfs_mount *mp = dqp->q_mount;
xfs_dqlock(dqp);
/*
* We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants.
+ * want to reclaim a dquot that lookup wants. We release the
+ * freelist lock and start over, so that lookup will grab
+ * both the dquot and the freelistlock.
*/
if (dqp->dq_flags & XFS_DQ_WANT) {
+ ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+ trace_xfs_dqreclaim_want(dqp);
+
xfs_dqunlock(dqp);
- xfs_qm_freelist_unlock(xfs_Gqm);
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return nreclaimed;
+ return NULL;
XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- goto tryagain;
+ goto startagain;
}
/*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
* life easier.
*/
if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
+ ASSERT(mp == NULL);
ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
+ ASSERT(list_empty(&dqp->q_hashlist));
+ ASSERT(list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ xfs_dqunlock(dqp);
+ dqpout = dqp;
XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- nextdqp = dqp->dq_flnext;
- goto off_freelist;
+ break;
}
- ASSERT(dqp->MPL_PREVP);
+ ASSERT(dqp->q_hash);
+ ASSERT(!list_empty(&dqp->q_mplist));
+
/*
* Try to grab the flush lock. If this dquot is in the process of
* getting flushed to disk, we don't want to reclaim it.
*/
if (!xfs_dqflock_nowait(dqp)) {
xfs_dqunlock(dqp);
- dqp = dqp->dq_flnext;
continue;
}
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
if (XFS_DQ_IS_DIRTY(dqp)) {
int error;
- trace_xfs_dqshake_dirty(dqp);
+ trace_xfs_dqreclaim_dirty(dqp);
/*
* We flush it delayed write, so don't bother
- * releasing the mplock.
+ * releasing the freelist lock.
*/
error = xfs_qm_dqflush(dqp, 0);
if (error) {
- xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
- "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
}
xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
- dqp = dqp->dq_flnext;
continue;
}
+
/*
* We're trying to get the hashlock out of order. This races
* with dqlookup; so, we giveup and goto the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
* waiting for the freelist lock.
*/
if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- dqp = dqp->dq_flnext;
- continue;
+ restarts++;
+ goto dqfunlock;
}
+
/*
* This races with dquot allocation code as well as dqflush_all
* and reclaim code. So, if we failed to grab the mplist lock,
* giveup everything and start over.
*/
- hash = dqp->q_hash;
- ASSERT(hash);
- if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
- /* XXX put a sentinel so that we can come back here */
+ if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+ restarts++;
+ mutex_unlock(&dqp->q_hash->qh_lock);
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- mutex_unlock(&hash->qh_lock);
- xfs_qm_freelist_unlock(xfs_Gqm);
- if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return nreclaimed;
- goto tryagain;
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
+ return NULL;
+ goto startagain;
}
- trace_xfs_dqshake_unlink(dqp);
-
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
- dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
ASSERT(dqp->q_nrefs == 0);
- nextdqp = dqp->dq_flnext;
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
- XQM_HASHLIST_REMOVE(hash, dqp);
+ list_del_init(&dqp->q_mplist);
+ mp->m_quotainfo->qi_dquots--;
+ mp->m_quotainfo->qi_dqreclaims++;
+ list_del_init(&dqp->q_hashlist);
+ dqp->q_hash->qh_version++;
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ dqpout = dqp;
+ mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+ mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
xfs_dqfunlock(dqp);
- xfs_qm_mplist_unlock(dqp->q_mount);
- mutex_unlock(&hash->qh_lock);
-
- off_freelist:
- XQM_FREELIST_REMOVE(dqp);
xfs_dqunlock(dqp);
- nreclaimed++;
- XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
+ if (dqpout)
+ break;
+ if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+ return NULL;
+ }
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+ int howmany)
+{
+ int nreclaimed = 0;
+ xfs_dquot_t *dqp;
+
+ if (howmany <= 0)
+ return 0;
+
+ while (nreclaimed < howmany) {
+ dqp = xfs_qm_dqreclaim_one();
+ if (!dqp)
+ return nreclaimed;
xfs_qm_dqdestroy(dqp);
- dqp = nextdqp;
+ nreclaimed++;
}
- xfs_qm_freelist_unlock(xfs_Gqm);
return nreclaimed;
}
-
/*
* The kmem_shake interface is invoked when memory is running low.
*/
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
if (!xfs_Gqm)
return 0;
- nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+ nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
/* incore dquots in all f/s's */
ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
}
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
- xfs_dquot_t *dqpout;
- xfs_dquot_t *dqp;
- int restarts;
- int nflushes;
-
- restarts = 0;
- dqpout = NULL;
- nflushes = 0;
-
- /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
- xfs_qm_freelist_lock(xfs_Gqm);
-
- FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
- xfs_dqlock(dqp);
-
- /*
- * We are racing with dqlookup here. Naturally we don't
- * want to reclaim a dquot that lookup wants. We release the
- * freelist lock and start over, so that lookup will grab
- * both the dquot and the freelistlock.
- */
- if (dqp->dq_flags & XFS_DQ_WANT) {
- ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
- trace_xfs_dqreclaim_want(dqp);
-
- xfs_dqunlock(dqp);
- xfs_qm_freelist_unlock(xfs_Gqm);
- if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- return NULL;
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- goto startagain;
- }
-
- /*
- * If the dquot is inactive, we are assured that it is
- * not on the mplist or the hashlist, and that makes our
- * life easier.
- */
- if (dqp->dq_flags & XFS_DQ_INACTIVE) {
- ASSERT(dqp->q_mount == NULL);
- ASSERT(! XFS_DQ_IS_DIRTY(dqp));
- ASSERT(dqp->HL_PREVP == NULL);
- ASSERT(dqp->MPL_PREVP == NULL);
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- dqpout = dqp;
- XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
- break;
- }
-
- ASSERT(dqp->q_hash);
- ASSERT(dqp->MPL_PREVP);
-
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- continue;
- }
-
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- error = xfs_qm_dqflush(dqp, 0);
- if (error) {
- xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
- "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
- }
- xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
- continue;
- }
-
- if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- continue;
- }
-
- if (!mutex_trylock(&dqp->q_hash->qh_lock))
- goto mplistunlock;
-
- trace_xfs_dqreclaim_unlink(dqp);
-
- ASSERT(dqp->q_nrefs == 0);
- XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
- XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
- XQM_FREELIST_REMOVE(dqp);
- dqpout = dqp;
- mutex_unlock(&dqp->q_hash->qh_lock);
- mplistunlock:
- xfs_qm_mplist_unlock(dqp->q_mount);
- xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
- if (dqpout)
- break;
- }
-
- xfs_qm_freelist_unlock(xfs_Gqm);
- return dqpout;
-}
-
-
/*------------------------------------------------------------------*/
/*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
}
}
-/* ------------- list stuff -----------------*/
-STATIC void
-xfs_qm_freelist_init(xfs_frlist_t *ql)
-{
- ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
- mutex_init(&ql->qh_lock);
- ql->qh_version = 0;
- ql->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_freelist_destroy(xfs_frlist_t *ql)
-{
- xfs_dquot_t *dqp, *nextdqp;
-
- mutex_lock(&ql->qh_lock);
- for (dqp = ql->qh_next;
- dqp != (xfs_dquot_t *)ql; ) {
- xfs_dqlock(dqp);
- nextdqp = dqp->dq_flnext;
-#ifdef QUOTADEBUG
- cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
-#endif
- XQM_FREELIST_REMOVE(dqp);
- xfs_dqunlock(dqp);
- xfs_qm_dqdestroy(dqp);
- dqp = nextdqp;
- }
- mutex_unlock(&ql->qh_lock);
- mutex_destroy(&ql->qh_lock);
-
- ASSERT(ql->qh_nelems == 0);
-}
-
-STATIC void
-xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
- dq->dq_flnext = ql->qh_next;
- dq->dq_flprev = (xfs_dquot_t *)ql;
- ql->qh_next = dq;
- dq->dq_flnext->dq_flprev = dq;
- xfs_Gqm->qm_dqfreelist.qh_nelems++;
- xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_unlink(xfs_dquot_t *dq)
-{
- xfs_dquot_t *next = dq->dq_flnext;
- xfs_dquot_t *prev = dq->dq_flprev;
-
- next->dq_flprev = prev;
- prev->dq_flnext = next;
- dq->dq_flnext = dq->dq_flprev = dq;
- xfs_Gqm->qm_dqfreelist.qh_nelems--;
- xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
- xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b..c9446f1 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
typedef xfs_dqhash_t xfs_dqlist_t;
-/*
- * The freelist head. The first two fields match the first two in the
- * xfs_dquot_t structure (in xfs_dqmarker_t)
- */
-typedef struct xfs_frlist {
- struct xfs_dquot *qh_next;
- struct xfs_dquot *qh_prev;
- struct mutex qh_lock;
- uint qh_version;
- uint qh_nelems;
-} xfs_frlist_t;
/*
* Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
- xfs_frlist_t qm_dqfreelist; /* freelist of dquots */
+ struct list_head qm_dqfrlist; /* freelist of dquots */
+ struct mutex qm_dqfrlist_lock;
+ int qm_dqfrlist_cnt;
atomic_t qm_totaldquots; /* total incore dquots */
uint qm_nrefs; /* file systems with quota on */
int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
typedef struct xfs_quotainfo {
xfs_inode_t *qi_uquotaip; /* user quota inode */
xfs_inode_t *qi_gquotaip; /* group quota inode */
- xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
+ struct list_head qi_dqlist; /* all dquots in filesys */
+ struct mutex qi_dqlist_lock;
+ int qi_dquots;
int qi_dqreclaims; /* a change here indicates
a removal in the dqlist */
time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
-/* list stuff */
-extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
-extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
-
#ifdef DEBUG
extern int xfs_qm_internalqcheck(xfs_mount_t *);
#else
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3..3d1fc79 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
ndquot,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
+ xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
return 0;
}
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 50bee07..26fa431 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
xfs_mount_t *mp,
uint flags)
{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
uint dqtype;
int error;
uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
* critical thing.
* If quotaoff, then we must be dealing with the root filesystem.
*/
- ASSERT(mp->m_quotainfo);
- if (mp->m_quotainfo)
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
- ASSERT(mp->m_quotainfo);
+ ASSERT(q);
+ mutex_lock(&q->qi_quotaofflock);
/*
* If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_qflags = mp->m_qflags;
spin_unlock(&mp->m_sb_lock);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+ mutex_unlock(&q->qi_quotaofflock);
/* XXX what to do if error ? Revert back to old vals incore ? */
error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
* Nothing to do? Don't complain. This happens when we're just
* turning off quota enforcement.
*/
- if ((mp->m_qflags & flags) == 0) {
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- return (0);
- }
+ if ((mp->m_qflags & flags) == 0)
+ goto out_unlock;
/*
* Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
*/
error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
if (error)
- goto out_error;
+ goto out_unlock;
/*
* Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
* So, if we couldn't purge all the dquots from the filesystem,
* we can't get rid of the incore data structures.
*/
- while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
+ while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
delay(10 * nculprits);
/*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
if (error) {
/* We're screwed now. Shutdown is the only option. */
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- goto out_error;
+ goto out_unlock;
}
/*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
*/
if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+ mutex_unlock(&q->qi_quotaofflock);
xfs_qm_destroy_quotainfo(mp);
return (0);
}
/*
- * Release our quotainode references, and vn_purge them,
- * if we don't need them anymore.
+ * Release our quotainode references if we don't need them anymore.
*/
- if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
- IRELE(XFS_QI_UQIP(mp));
- XFS_QI_UQIP(mp) = NULL;
+ if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
+ IRELE(q->qi_uquotaip);
+ q->qi_uquotaip = NULL;
}
- if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
- IRELE(XFS_QI_GQIP(mp));
- XFS_QI_GQIP(mp) = NULL;
+ if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+ IRELE(q->qi_gquotaip);
+ q->qi_gquotaip = NULL;
}
-out_error:
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
- return (error);
+out_unlock:
+ mutex_unlock(&q->qi_quotaofflock);
+ return error;
}
int
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
/*
* Switch on quota enforcement in core.
*/
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
return (0);
}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
*/
int
xfs_qm_scall_getqstat(
- xfs_mount_t *mp,
- fs_quota_stat_t *out)
+ struct xfs_mount *mp,
+ struct fs_quota_stat *out)
{
- xfs_inode_t *uip, *gip;
- boolean_t tempuqip, tempgqip;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_inode *uip, *gip;
+ boolean_t tempuqip, tempgqip;
uip = gip = NULL;
tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
- if (mp->m_quotainfo) {
- uip = mp->m_quotainfo->qi_uquotaip;
- gip = mp->m_quotainfo->qi_gquotaip;
+ if (q) {
+ uip = q->qi_uquotaip;
+ gip = q->qi_gquotaip;
}
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,15 +437,15 @@ xfs_qm_scall_getqstat(
if (tempgqip)
IRELE(gip);
}
- if (mp->m_quotainfo) {
- out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
- out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
- out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
- out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
- out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
- out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
+ if (q) {
+ out->qs_incoredqs = q->qi_dquots;
+ out->qs_btimelimit = q->qi_btimelimit;
+ out->qs_itimelimit = q->qi_itimelimit;
+ out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+ out->qs_bwarnlimit = q->qi_bwarnlimit;
+ out->qs_iwarnlimit = q->qi_iwarnlimit;
}
- return (0);
+ return 0;
}
/*
@@ -462,6 +458,7 @@ xfs_qm_scall_setqlim(
uint type,
fs_disk_quota_t *newlim)
{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
xfs_disk_dquot_t *ddq;
xfs_dquot_t *dqp;
xfs_trans_t *tp;
@@ -485,7 +482,7 @@ xfs_qm_scall_setqlim(
* a quotaoff from happening). (XXXThis doesn't currently happen
* because we take the vfslock before calling xfs_qm_sysent).
*/
- mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
+ mutex_lock(&q->qi_quotaofflock);
/*
* Get the dquot (locked), and join it to the transaction.
@@ -493,9 +490,8 @@ xfs_qm_scall_setqlim(
*/
if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
ASSERT(error != ENOENT);
- return (error);
+ goto out_unlock;
}
xfs_trans_dqjoin(tp, dqp);
ddq = &dqp->q_core;
@@ -513,8 +509,8 @@ xfs_qm_scall_setqlim(
ddq->d_blk_hardlimit = cpu_to_be64(hard);
ddq->d_blk_softlimit = cpu_to_be64(soft);
if (id == 0) {
- mp->m_quotainfo->qi_bhardlimit = hard;
- mp->m_quotainfo->qi_bsoftlimit = soft;
+ q->qi_bhardlimit = hard;
+ q->qi_bsoftlimit = soft;
}
} else {
qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +525,8 @@ xfs_qm_scall_setqlim(
ddq->d_rtb_hardlimit = cpu_to_be64(hard);
ddq->d_rtb_softlimit = cpu_to_be64(soft);
if (id == 0) {
- mp->m_quotainfo->qi_rtbhardlimit = hard;
- mp->m_quotainfo->qi_rtbsoftlimit = soft;
+ q->qi_rtbhardlimit = hard;
+ q->qi_rtbsoftlimit = soft;
}
} else {
qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +542,8 @@ xfs_qm_scall_setqlim(
ddq->d_ino_hardlimit = cpu_to_be64(hard);
ddq->d_ino_softlimit = cpu_to_be64(soft);
if (id == 0) {
- mp->m_quotainfo->qi_ihardlimit = hard;
- mp->m_quotainfo->qi_isoftlimit = soft;
+ q->qi_ihardlimit = hard;
+ q->qi_isoftlimit = soft;
}
} else {
qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +568,23 @@ xfs_qm_scall_setqlim(
* for warnings.
*/
if (newlim->d_fieldmask & FS_DQ_BTIMER) {
- mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
+ q->qi_btimelimit = newlim->d_btimer;
ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
}
if (newlim->d_fieldmask & FS_DQ_ITIMER) {
- mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
+ q->qi_itimelimit = newlim->d_itimer;
ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
}
if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
- mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
+ q->qi_rtbtimelimit = newlim->d_rtbtimer;
ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
}
if (newlim->d_fieldmask & FS_DQ_BWARNS)
- mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns;
+ q->qi_bwarnlimit = newlim->d_bwarns;
if (newlim->d_fieldmask & FS_DQ_IWARNS)
- mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns;
+ q->qi_iwarnlimit = newlim->d_iwarns;
if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
- mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns;
+ q->qi_rtbwarnlimit = newlim->d_rtbwarns;
} else {
/*
* If the user is now over quota, start the timelimit.
@@ -605,8 +601,9 @@ xfs_qm_scall_setqlim(
error = xfs_trans_commit(tp, 0);
xfs_qm_dqprint(dqp);
xfs_qm_dqrele(dqp);
- mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+ out_unlock:
+ mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -853,7 +850,8 @@ xfs_dqrele_inode(
int error;
/* skip quota inodes */
- if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
+ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
+ ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
read_unlock(&pag->pag_ici_lock);
@@ -931,7 +929,8 @@ struct mutex qcheck_lock;
}
typedef struct dqtest {
- xfs_dqmarker_t q_lists;
+ uint dq_flags; /* various flags (XFS_DQ_*) */
+ struct list_head q_hashlist;
xfs_dqhash_t *q_hash; /* the hashchain header */
xfs_mount_t *q_mount; /* filesystem this relates to */
xfs_dqid_t d_id; /* user id or group id */
@@ -942,14 +941,9 @@ typedef struct dqtest {
STATIC void
xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
{
- xfs_dquot_t *d;
- if (((d) = (h)->qh_next))
- (d)->HL_PREVP = &((dqp)->HL_NEXT);
- (dqp)->HL_NEXT = d;
- (dqp)->HL_PREVP = &((h)->qh_next);
- (h)->qh_next = (xfs_dquot_t *)dqp;
- (h)->qh_version++;
- (h)->qh_nelems++;
+ list_add(&dqp->q_hashlist, &h->qh_list);
+ h->qh_version++;
+ h->qh_nelems++;
}
STATIC void
xfs_qm_dqtest_print(
@@ -1061,9 +1055,7 @@ xfs_qm_internalqcheck_dqget(
xfs_dqhash_t *h;
h = DQTEST_HASH(mp, id, type);
- for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
- d = (xfs_dqtest_t *) d->HL_NEXT) {
- /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
+ list_for_each_entry(d, &h->qh_list, q_hashlist) {
if (d->d_id == id && mp == d->q_mount) {
*O_dq = d;
return (0);
@@ -1074,6 +1066,7 @@ xfs_qm_internalqcheck_dqget(
d->d_id = id;
d->q_mount = mp;
d->q_hash = h;
+ INIT_LIST_HEAD(&d->q_hashlist);
xfs_qm_hashinsert(h, d);
*O_dq = d;
return (0);
@@ -1180,8 +1173,6 @@ xfs_qm_internalqcheck(
xfs_ino_t lastino;
int done, count;
int i;
- xfs_dqtest_t *d, *e;
- xfs_dqhash_t *h1;
int error;
lastino = 0;
@@ -1221,19 +1212,18 @@ xfs_qm_internalqcheck(
}
cmn_err(CE_DEBUG, "Checking results against system dquots");
for (i = 0; i < qmtest_hashmask; i++) {
- h1 = &qmtest_udqtab[i];
- for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+ xfs_dqtest_t *d, *n;
+ xfs_dqhash_t *h;
+
+ h = &qmtest_udqtab[i];
+ list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
xfs_dqtest_cmp(d);
- e = (xfs_dqtest_t *) d->HL_NEXT;
kmem_free(d);
- d = e;
}
- h1 = &qmtest_gdqtab[i];
- for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+ h = &qmtest_gdqtab[i];
+ list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
xfs_dqtest_cmp(d);
- e = (xfs_dqtest_t *) d->HL_NEXT;
kmem_free(d);
- d = e;
}
}
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b28..94a3d92 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
*/
#define XFS_DQITER_MAP_SIZE 10
-/* Number of dquots that fit in to a dquot block */
-#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
-
-#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
-
-#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
-#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
-#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
-#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
-#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
-#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
-#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
-#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
-#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
-#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
-#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
-
-#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
-#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
-
-#define xfs_qm_mplist_lock(mp) \
- mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define xfs_qm_mplist_nowait(mp) \
- mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define xfs_qm_mplist_unlock(mp) \
- mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define XFS_QM_IS_MPLIST_LOCKED(mp) \
- mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
-
-#define xfs_qm_freelist_lock(qm) \
- mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
-#define xfs_qm_freelist_lock_nowait(qm) \
- mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
-#define xfs_qm_freelist_unlock(qm) \
- mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
-
/*
* Hash into a bucket in the dquot hash table, based on <mp, id>.
*/
@@ -72,9 +35,6 @@
XFS_DQ_HASHVAL(mp, id)) : \
(xfs_Gqm->qm_grp_dqhtable + \
XFS_DQ_HASHVAL(mp, id)))
-#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
- XFS_IS_UQUOTA_ON(mp) : \
- XFS_IS_OQUOTA_ON(mp))
#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
!dqp->q_core.d_blk_hardlimit && \
!dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
!dqp->q_core.d_rtbcount && \
!dqp->q_core.d_icount)
-#define HL_PREVP dq_hashlist.ql_prevp
-#define HL_NEXT dq_hashlist.ql_next
-#define MPL_PREVP dq_mplist.ql_prevp
-#define MPL_NEXT dq_mplist.ql_next
-
-
-#define _LIST_REMOVE(h, dqp, PVP, NXT) \
- { \
- xfs_dquot_t *d; \
- if (((d) = (dqp)->NXT)) \
- (d)->PVP = (dqp)->PVP; \
- *((dqp)->PVP) = d; \
- (dqp)->NXT = NULL; \
- (dqp)->PVP = NULL; \
- (h)->qh_version++; \
- (h)->qh_nelems--; \
- }
-
-#define _LIST_INSERT(h, dqp, PVP, NXT) \
- { \
- xfs_dquot_t *d; \
- if (((d) = (h)->qh_next)) \
- (d)->PVP = &((dqp)->NXT); \
- (dqp)->NXT = d; \
- (dqp)->PVP = &((h)->qh_next); \
- (h)->qh_next = dqp; \
- (h)->qh_version++; \
- (h)->qh_nelems++; \
- }
-
-#define FOREACH_DQUOT_IN_MP(dqp, mp) \
- for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
-
-#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
-for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
- (dqp) = (dqp)->dq_flnext)
-
-#define XQM_HASHLIST_INSERT(h, dqp) \
- _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
-
-#define XQM_FREELIST_INSERT(h, dqp) \
- xfs_qm_freelist_append(h, dqp)
-
-#define XQM_MPLIST_INSERT(h, dqp) \
- _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
-
-#define XQM_HASHLIST_REMOVE(h, dqp) \
- _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
-#define XQM_FREELIST_REMOVE(dqp) \
- xfs_qm_freelist_unlink(dqp)
-#define XQM_MPLIST_REMOVE(h, dqp) \
- { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
- XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
-
-#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
-
-#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
- (tp)->t_dqinfo->dqa_usrdquots : \
- (tp)->t_dqinfo->dqa_grpdquots)
-#define XFS_IS_SUSER_DQUOT(dqp) \
- (!((dqp)->q_core.d_id))
-
#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
(((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
(((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75c..061d827 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- xfs_dq_logitem_t *lp;
+ xfs_dq_logitem_t *lp = &dqp->q_logitem;
- ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+ ASSERT(dqp->q_transp != tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
- lp = &dqp->q_logitem;
+ ASSERT(lp->qli_dquot == dqp);
/*
* Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
{
xfs_log_item_desc_t *lidp;
- ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+ ASSERT(dqp->q_transp == tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
int i;
xfs_dqtrx_t *qa;
- for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
- qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
+ qa = XFS_QM_ISUDQ(dqp) ?
+ tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
- qa[i].qt_dquot == dqp) {
- return (&qa[i]);
- }
+ qa[i].qt_dquot == dqp)
+ return &qa[i];
}
- return (NULL);
+ return NULL;
}
/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
break;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+ ASSERT(dqp->q_transp == tp);
/*
* adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
softlimit = q->qi_bsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_btimer);
warns = be16_to_cpu(dqp->q_core.d_bwarns);
- warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
resbcountp = &dqp->q_res_bcount;
} else {
ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
softlimit = q->qi_rtbsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
- warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
resbcountp = &dqp->q_res_rtbcount;
}
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
count = be64_to_cpu(dqp->q_core.d_icount);
timer = be32_to_cpu(dqp->q_core.d_itimer);
warns = be16_to_cpu(dqp->q_core.d_iwarns);
- warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (!hardlimit)
hardlimit = q->qi_ihardlimit;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d..99587de 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
}
if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
ASSERT(ip->i_df.if_ext_max ==
XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e6..240340a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -372,12 +372,12 @@ xfs_buf_item_pin(
*/
STATIC void
xfs_buf_item_unpin(
- xfs_buf_log_item_t *bip,
- int stale)
+ xfs_buf_log_item_t *bip)
{
struct xfs_ail *ailp;
xfs_buf_t *bp;
int freed;
+ int stale = bip->bli_flags & XFS_BLI_STALE;
bp = bip->bli_buf;
ASSERT(bp != NULL);
@@ -428,40 +428,34 @@ xfs_buf_item_unpin_remove(
xfs_buf_log_item_t *bip,
xfs_trans_t *tp)
{
- xfs_buf_t *bp;
- xfs_log_item_desc_t *lidp;
- int stale = 0;
-
- bp = bip->bli_buf;
- /*
- * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
- */
+ /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
if ((atomic_read(&bip->bli_refcount) == 1) &&
(bip->bli_flags & XFS_BLI_STALE)) {
+ /*
+ * yes -- We can safely do some work here and then call
+ * buf_item_unpin to do the rest because we are
+ * are holding the buffer locked so no one else will be
+ * able to bump up the refcount. We have to remove the
+ * log item from the transaction as we are about to release
+ * our reference to the buffer. If we don't, the unlock that
+ * occurs later in the xfs_trans_uncommit() will try to
+ * reference the buffer which we no longer have a hold on.
+ */
+ struct xfs_log_item_desc *lidp;
+
ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
trace_xfs_buf_item_unpin_stale(bip);
- /*
- * yes -- clear the xaction descriptor in-use flag
- * and free the chunk if required. We can safely
- * do some work here and then call buf_item_unpin
- * to do the rest because if the if is true, then
- * we are holding the buffer locked so no one else
- * will be able to bump up the refcount.
- */
- lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
- stale = lidp->lid_flags & XFS_LID_BUF_STALE;
+ lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
xfs_trans_free_item(tp, lidp);
+
/*
- * Since the transaction no longer refers to the buffer,
- * the buffer should no longer refer to the transaction.
+ * Since the transaction no longer refers to the buffer, the
+ * buffer should no longer refer to the transaction.
*/
- XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+ XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
}
-
- xfs_buf_item_unpin(bip, stale);
-
- return;
+ xfs_buf_item_unpin(bip);
}
/*
@@ -675,7 +669,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_buf_item_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
xfs_buf_item_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -733,10 +727,7 @@ xfs_buf_item_init(
bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
KM_SLEEP);
- bip->bli_item.li_type = XFS_LI_BUF;
- bip->bli_item.li_ops = &xfs_buf_item_ops;
- bip->bli_item.li_mountp = mp;
- bip->bli_item.li_ailp = mp->m_ail;
+ xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
xfs_buf_hold(bp);
bip->bli_format.blf_type = XFS_LI_BUF;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34a..df44545 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
* have been logged.
* For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
*/
-typedef struct xfs_buf_log_format_t {
+typedef struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
ushort blf_flags; /* misc state */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5..ef96175 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
void
xfs_error_report(
- char *tag,
- int level,
- xfs_mount_t *mp,
- char *fname,
- int linenum,
- inst_t *ra)
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
{
if (level <= xfs_error_level) {
xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
CE_ALERT, mp,
"XFS internal error %s at line %d of file %s. Caller 0x%p\n",
- tag, linenum, fname, ra);
+ tag, linenum, filename, ra);
xfs_stack_trace();
}
@@ -205,15 +205,15 @@ xfs_error_report(
void
xfs_corruption_error(
- char *tag,
- int level,
- xfs_mount_t *mp,
- void *p,
- char *fname,
- int linenum,
- inst_t *ra)
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ void *p,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
{
if (level <= xfs_error_level)
xfs_hex_dump(p, 16);
- xfs_error_report(tag, level, mp, fname, linenum, ra);
+ xfs_error_report(tag, level, mp, filename, linenum, ra);
}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051..c2c1a07 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
struct xfs_mount;
-extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp,
- char *fname, int linenum, inst_t *ra);
-extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
- void *p, char *fname, int linenum, inst_t *ra);
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+ struct xfs_mount *mp, void *p, const char *filename,
+ int linenum, inst_t *ra);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1..409fe81 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
*/
/*ARGSUSED*/
STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
+xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
{
struct xfs_ail *ailp = efip->efi_item.li_ailp;
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_efi_item_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
xfs_efi_item_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
KM_SLEEP);
}
- efip->efi_item.li_type = XFS_LI_EFI;
- efip->efi_item.li_ops = &xfs_efi_item_ops;
- efip->efi_item.li_mountp = mp;
- efip->efi_item.li_ailp = mp->m_ail;
+ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
*/
/*ARGSUSED*/
STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
+xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
{
return;
}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_efd_item_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
xfs_efd_item_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
KM_SLEEP);
}
- efdp->efd_item.li_type = XFS_LI_EFD;
- efdp->efd_item.li_ops = &xfs_efd_item_ops;
- efdp->efd_item.li_mountp = mp;
- efdp->efd_item.li_ailp = mp->m_ail;
+ xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
efdp->efd_efip = efip;
efdp->efd_format.efd_nextents = nextents;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd564..8cd6e8d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
{
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+
/* Give the log a push to start the unpinning I/O */
xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea85..cf8249a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
{
ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+ trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
atomic_inc(&iip->ili_inode->i_pincount);
}
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
/* ARGSUSED */
STATIC void
xfs_inode_item_unpin(
- xfs_inode_log_item_t *iip,
- int stale)
+ xfs_inode_log_item_t *iip)
{
struct xfs_inode *ip = iip->ili_inode;
+ trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
xfs_inode_log_item_t *iip,
xfs_trans_t *tp)
{
- xfs_inode_item_unpin(iip, 0);
+ xfs_inode_item_unpin(iip);
}
/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
.iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
xfs_inode_item_format,
.iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
- .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
+ .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
xfs_inode_item_unpin_remove,
.iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
ASSERT(ip->i_itemp == NULL);
iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
- iip->ili_item.li_type = XFS_LI_INODE;
- iip->ili_item.li_ops = &xfs_inode_item_ops;
- iip->ili_item.li_mountp = mp;
- iip->ili_item.li_ailp = mp->m_ail;
iip->ili_inode = ip;
-
- /*
- We have zeroed memory. No need ...
- iip->ili_extents_buf = NULL;
- */
-
+ xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+ &xfs_inode_item_ops);
iip->ili_format.ilf_type = XFS_LI_INODE;
iip->ili_format.ilf_ino = ip->i_ino;
iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b650399..ef14943 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
#define XFS_STRAT_WRITE_IMAPS 2
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
-STATIC int
-xfs_imap_to_bmap(
- xfs_inode_t *ip,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- xfs_iomap_t *iomapp,
- int imaps, /* Number of imap entries */
- int iomaps, /* Number of iomap entries */
- int flags)
-{
- xfs_mount_t *mp = ip->i_mount;
- int pbm;
- xfs_fsblock_t start_block;
-
-
- for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
- iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomapp->iomap_delta = offset - iomapp->iomap_offset;
- iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomapp->iomap_flags = flags;
-
- if (XFS_IS_REALTIME_INODE(ip)) {
- iomapp->iomap_flags |= IOMAP_REALTIME;
- iomapp->iomap_target = mp->m_rtdev_targp;
- } else {
- iomapp->iomap_target = mp->m_ddev_targp;
- }
- start_block = imap->br_startblock;
- if (start_block == HOLESTARTBLOCK) {
- iomapp->iomap_bn = IOMAP_DADDR_NULL;
- iomapp->iomap_flags |= IOMAP_HOLE;
- } else if (start_block == DELAYSTARTBLOCK) {
- iomapp->iomap_bn = IOMAP_DADDR_NULL;
- iomapp->iomap_flags |= IOMAP_DELAY;
- } else {
- iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
- if (ISUNWRITTEN(imap))
- iomapp->iomap_flags |= IOMAP_UNWRITTEN;
- }
-
- offset += iomapp->iomap_bsize - iomapp->iomap_delta;
- }
- return pbm; /* Return the number filled */
-}
+STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+ int, struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
+ struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *, int *);
int
xfs_iomap(
- xfs_inode_t *ip,
- xfs_off_t offset,
- ssize_t count,
- int flags,
- xfs_iomap_t *iomapp,
- int *niomaps)
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ ssize_t count,
+ int flags,
+ struct xfs_bmbt_irec *imap,
+ int *nimaps,
+ int *new)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb, end_fsb;
- int error = 0;
- int lockmode = 0;
- xfs_bmbt_irec_t imap;
- int nimaps = 1;
- int bmapi_flags = 0;
- int iomap_flags = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int lockmode = 0;
+ int bmapi_flags = 0;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
+ *new = 0;
+
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -160,8 +122,8 @@ xfs_iomap(
error = xfs_bmapi(NULL, ip, offset_fsb,
(xfs_filblks_t)(end_fsb - offset_fsb),
- bmapi_flags, NULL, 0, &imap,
- &nimaps, NULL, NULL);
+ bmapi_flags, NULL, 0, imap,
+ nimaps, NULL, NULL);
if (error)
goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
case BMAPI_WRITE:
/* If we found an extent, return it */
- if (nimaps &&
- (imap.br_startblock != HOLESTARTBLOCK) &&
- (imap.br_startblock != DELAYSTARTBLOCK)) {
- trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+ if (*nimaps &&
+ (imap->br_startblock != HOLESTARTBLOCK) &&
+ (imap->br_startblock != DELAYSTARTBLOCK)) {
+ trace_xfs_iomap_found(ip, offset, count, flags, imap);
break;
}
if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
error = xfs_iomap_write_direct(ip, offset, count, flags,
- &imap, &nimaps, nimaps);
+ imap, nimaps);
} else {
error = xfs_iomap_write_delay(ip, offset, count, flags,
- &imap, &nimaps);
+ imap, nimaps);
}
if (!error) {
- trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
+ trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
}
- iomap_flags = IOMAP_NEW;
+ *new = 1;
break;
case BMAPI_ALLOCATE:
/* If we found an extent, return it */
xfs_iunlock(ip, lockmode);
lockmode = 0;
- if (nimaps && !isnullstartblock(imap.br_startblock)) {
- trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+ if (*nimaps && !isnullstartblock(imap->br_startblock)) {
+ trace_xfs_iomap_found(ip, offset, count, flags, imap);
break;
}
error = xfs_iomap_write_allocate(ip, offset, count,
- &imap, &nimaps);
+ imap, nimaps);
break;
}
- if (nimaps) {
- *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
- iomapp, nimaps, *niomaps, iomap_flags);
- } else if (niomaps) {
- *niomaps = 0;
- }
+ ASSERT(*nimaps <= 1);
out:
if (lockmode)
@@ -216,7 +173,6 @@ out:
return XFS_ERROR(error);
}
-
STATIC int
xfs_iomap_eof_align_last_fsb(
xfs_mount_t *mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
return EFSCORRUPTED;
}
-int
+STATIC int
xfs_iomap_write_direct(
xfs_inode_t *ip,
xfs_off_t offset,
size_t count,
int flags,
xfs_bmbt_irec_t *ret_imap,
- int *nmaps,
- int found)
+ int *nmaps)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
if (error)
goto error_out;
} else {
- if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
+ if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
ret_imap->br_blockcount +
ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
return 0;
}
-int
+STATIC int
xfs_iomap_write_delay(
xfs_inode_t *ip,
xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
* We no longer bother to look at the incoming map - all we have to
* guarantee is that whatever we allocate fills the required range.
*/
-int
+STATIC int
xfs_iomap_write_allocate(
xfs_inode_t *ip,
xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f299..81ac4af 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
#ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__
-#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
-
-
-typedef enum { /* iomap_flags values */
- IOMAP_READ = 0, /* mapping for a read */
- IOMAP_HOLE = 0x02, /* mapping covers a hole */
- IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
- IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
- IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
- /* but uninitialized file data */
- IOMAP_NEW = 0x40 /* just allocate */
-} iomap_flags_t;
-
typedef enum {
/* base extent manipulation calls */
BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
{ BMAPI_MMAP, "MMAP" }, \
{ BMAPI_TRYLOCK, "TRYLOCK" }
-/*
- * xfs_iomap_t: File system I/O map
- *
- * The iomap_bn field is expressed in 512-byte blocks, and is where the
- * mapping starts on disk.
- *
- * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
- * iomap_offset is the offset of the mapping in the file itself.
- * iomap_bsize is the size of the mapping, iomap_delta is the
- * desired data's offset into the mapping, given the offset supplied
- * to the file I/O map routine.
- *
- * When a request is made to read beyond the logical end of the object,
- * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
- * to the actual amount of underlying storage that has been allocated, if any.
- */
-
-typedef struct xfs_iomap {
- xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
- xfs_buftarg_t *iomap_target;
- xfs_off_t iomap_offset; /* offset of mapping, bytes */
- xfs_off_t iomap_bsize; /* size of mapping, bytes */
- xfs_off_t iomap_delta; /* offset into mapping, bytes */
- iomap_flags_t iomap_flags;
-} xfs_iomap_t;
-
struct xfs_inode;
struct xfs_bmbt_irec;
extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
- struct xfs_iomap *, int *);
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
- int, struct xfs_bmbt_irec *, int *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
- struct xfs_bmbt_irec *, int *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *, int *);
+ struct xfs_bmbt_irec *, int *, int *);
extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2be0191..3038dd5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
kmem_zone_t *xfs_log_ticket_zone;
-#define xlog_write_adv_cnt(ptr, len, off, bytes) \
- { (ptr) += (bytes); \
- (len) -= (bytes); \
- (off) += (bytes);}
-
/* Local miscellaneous function prototypes */
-STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
xlog_in_core_t **, xfs_lsn_t *);
STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
xfs_buftarg_t *log_target,
@@ -59,11 +54,9 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
STATIC void xlog_dealloc_log(xlog_t *log);
-STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
- int nentries, struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- xlog_in_core_t **commit_iclog,
- uint flags);
+STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ xlog_in_core_t **commit_iclog, uint flags);
/* local state machine functions */
STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -102,7 +95,7 @@ STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
uint flags);
#if defined(DEBUG)
-STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
int count, boolean_t syncing);
@@ -258,7 +251,7 @@ xfs_log_done(
* If we get an error, just continue and give back the log ticket.
*/
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
+ (xlog_commit_record(log, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
flags |= XFS_LOG_REL_PERM_RESERV;
@@ -516,18 +509,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
#ifdef DEBUG
xlog_in_core_t *first_iclog;
#endif
- xfs_log_iovec_t reg[1];
xlog_ticket_t *tic = NULL;
xfs_lsn_t lsn;
int error;
- /* the data section must be 32 bit size aligned */
- struct {
- __uint16_t magic;
- __uint16_t pad1;
- __uint32_t pad2; /* may as well make it 64 bits */
- } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
-
/*
* Don't write out unmount record on read-only mounts.
* Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +534,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} while (iclog != first_iclog);
#endif
if (! (XLOG_FORCED_SHUTDOWN(log))) {
- reg[0].i_addr = (void*)&magic;
- reg[0].i_len = sizeof(magic);
- reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
-
error = xfs_log_reserve(mp, 600, 1, &tic,
XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
if (!error) {
+ /* the data section must be 32 bit size aligned */
+ struct {
+ __uint16_t magic;
+ __uint16_t pad1;
+ __uint32_t pad2; /* may as well make it 64 bits */
+ } magic = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ };
+ struct xfs_log_iovec reg = {
+ .i_addr = (void *)&magic,
+ .i_len = sizeof(magic),
+ .i_type = XLOG_REG_TYPE_UNMOUNT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+
/* remove inited flag */
- ((xlog_ticket_t *)tic)->t_flags = 0;
- error = xlog_write(mp, reg, 1, tic, &lsn,
+ tic->t_flags = 0;
+ error = xlog_write(log, &vec, tic, &lsn,
NULL, XLOG_UNMOUNT_TRANS);
/*
* At this point, we're umounting anyway,
@@ -648,10 +647,26 @@ xfs_log_unmount(xfs_mount_t *mp)
xlog_dealloc_log(mp->m_log);
}
+void
+xfs_log_item_init(
+ struct xfs_mount *mp,
+ struct xfs_log_item *item,
+ int type,
+ struct xfs_item_ops *ops)
+{
+ item->li_mountp = mp;
+ item->li_ailp = mp->m_ail;
+ item->li_type = type;
+ item->li_ops = ops;
+}
+
/*
* Write region vectors to log. The write happens using the space reservation
* of the ticket (tic). It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write().
+ * transaction occur with one call to xfs_log_write(). However, it is important
+ * to note that the transaction reservation code makes an assumption about the
+ * number of log headers a transaction requires that may be violated if you
+ * don't pass all the transaction vectors in one call....
*/
int
xfs_log_write(
@@ -663,11 +678,15 @@ xfs_log_write(
{
struct log *log = mp->m_log;
int error;
+ struct xfs_log_vec vec = {
+ .lv_niovecs = nentries,
+ .lv_iovecp = reg,
+ };
if (XLOG_FORCED_SHUTDOWN(log))
return XFS_ERROR(EIO);
- error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
+ error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
if (error)
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
return error;
@@ -1020,6 +1039,7 @@ xlog_alloc_log(xfs_mount_t *mp,
int i;
int iclogsize;
int error = ENOMEM;
+ uint log2_size = 0;
log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
if (!log) {
@@ -1045,29 +1065,30 @@ xlog_alloc_log(xfs_mount_t *mp,
error = EFSCORRUPTED;
if (xfs_sb_version_hassector(&mp->m_sb)) {
- log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
- if (log->l_sectbb_log < 0 ||
- log->l_sectbb_log > mp->m_sectbb_log) {
- xlog_warn("XFS: Log sector size (0x%x) out of range.",
- log->l_sectbb_log);
+ log2_size = mp->m_sb.sb_logsectlog;
+ if (log2_size < BBSHIFT) {
+ xlog_warn("XFS: Log sector size too small "
+ "(0x%x < 0x%x)", log2_size, BBSHIFT);
goto out_free_log;
}
- /* for larger sector sizes, must have v2 or external log */
- if (log->l_sectbb_log != 0 &&
- (log->l_logBBstart != 0 &&
- !xfs_sb_version_haslogv2(&mp->m_sb))) {
- xlog_warn("XFS: log sector size (0x%x) invalid "
- "for configuration.", log->l_sectbb_log);
+ log2_size -= BBSHIFT;
+ if (log2_size > mp->m_sectbb_log) {
+ xlog_warn("XFS: Log sector size too large "
+ "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
goto out_free_log;
}
- if (mp->m_sb.sb_logsectlog < BBSHIFT) {
- xlog_warn("XFS: Log sector log (0x%x) too small.",
- mp->m_sb.sb_logsectlog);
+
+ /* for larger sector sizes, must have v2 or external log */
+ if (log2_size && log->l_logBBstart > 0 &&
+ !xfs_sb_version_haslogv2(&mp->m_sb)) {
+
+ xlog_warn("XFS: log sector size (0x%x) invalid "
+ "for configuration.", log2_size);
goto out_free_log;
}
}
- log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
+ log->l_sectBBsize = 1 << log2_size;
xlog_get_iclog_buffer_size(mp, log);
@@ -1174,26 +1195,31 @@ out:
* ticket. Return the lsn of the commit record.
*/
STATIC int
-xlog_commit_record(xfs_mount_t *mp,
- xlog_ticket_t *ticket,
- xlog_in_core_t **iclog,
- xfs_lsn_t *commitlsnp)
+xlog_commit_record(
+ struct log *log,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ xfs_lsn_t *commitlsnp)
{
- int error;
- xfs_log_iovec_t reg[1];
-
- reg[0].i_addr = NULL;
- reg[0].i_len = 0;
- reg[0].i_type = XLOG_REG_TYPE_COMMIT;
+ struct xfs_mount *mp = log->l_mp;
+ int error;
+ struct xfs_log_iovec reg = {
+ .i_addr = NULL,
+ .i_len = 0,
+ .i_type = XLOG_REG_TYPE_COMMIT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
ASSERT_ALWAYS(iclog);
- if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
- iclog, XLOG_COMMIT_TRANS))) {
+ error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+ XLOG_COMMIT_TRANS);
+ if (error)
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- }
return error;
-} /* xlog_commit_record */
-
+}
/*
* Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1614,6 +1640,192 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
}
/*
+ * Calculate the potential space needed by the log vector. Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+ struct xlog_ticket *ticket,
+ struct xfs_log_vec *log_vector)
+{
+ struct xfs_log_vec *lv;
+ int headers = 0;
+ int len = 0;
+ int i;
+
+ /* acct for start rec of xact */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ headers++;
+
+ for (lv = log_vector; lv; lv = lv->lv_next) {
+ headers += lv->lv_niovecs;
+
+ for (i = 0; i < lv->lv_niovecs; i++) {
+ struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+
+ len += vecp->i_len;
+ xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+ }
+ }
+
+ ticket->t_res_num_ophdrs += headers;
+ len += headers * sizeof(struct xlog_op_header);
+
+ return len;
+}
+
+/*
+ * If first write for transaction, insert start record We can't be trying to
+ * commit if we are inited. We can't have any "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+ struct xlog_op_header *ophdr,
+ struct xlog_ticket *ticket)
+{
+ if (!(ticket->t_flags & XLOG_TIC_INITED))
+ return 0;
+
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = ticket->t_clientid;
+ ophdr->oh_len = 0;
+ ophdr->oh_flags = XLOG_START_TRANS;
+ ophdr->oh_res2 = 0;
+
+ ticket->t_flags &= ~XLOG_TIC_INITED;
+
+ return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+ struct log *log,
+ struct xlog_op_header *ophdr,
+ struct xlog_ticket *ticket,
+ uint flags)
+{
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = ticket->t_clientid;
+ ophdr->oh_res2 = 0;
+
+ /* are we copying a commit or unmount record? */
+ ophdr->oh_flags = flags;
+
+ /*
+ * We've seen logs corrupted with bad transaction client ids. This
+ * makes sure that XFS doesn't generate them on. Turn this into an EIO
+ * and shut down the filesystem.
+ */
+ switch (ophdr->oh_clientid) {
+ case XFS_TRANSACTION:
+ case XFS_VOLUME:
+ case XFS_LOG:
+ break;
+ default:
+ xfs_fs_cmn_err(CE_WARN, log->l_mp,
+ "Bad XFS transaction clientid 0x%x in ticket 0x%p",
+ ophdr->oh_clientid, ticket);
+ return NULL;
+ }
+
+ return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log. This has
+ * to handle region write split across multiple log buffers - this
+ * state is kept external to this function so that this code can
+ * can be written in an obvious, self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+ struct xlog_ticket *ticket,
+ struct xlog_op_header *ophdr,
+ int space_available,
+ int space_required,
+ int *copy_off,
+ int *copy_len,
+ int *last_was_partial_copy,
+ int *bytes_consumed)
+{
+ int still_to_copy;
+
+ still_to_copy = space_required - *bytes_consumed;
+ *copy_off = *bytes_consumed;
+
+ if (still_to_copy <= space_available) {
+ /* write of region completes here */
+ *copy_len = still_to_copy;
+ ophdr->oh_len = cpu_to_be32(*copy_len);
+ if (*last_was_partial_copy)
+ ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+ *last_was_partial_copy = 0;
+ *bytes_consumed = 0;
+ return 0;
+ }
+
+ /* partial write of region, needs extra log op header reservation */
+ *copy_len = space_available;
+ ophdr->oh_len = cpu_to_be32(*copy_len);
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+ if (*last_was_partial_copy)
+ ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+ *bytes_consumed += *copy_len;
+ (*last_was_partial_copy)++;
+
+ /* account for new log op header */
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ ticket->t_res_num_ophdrs++;
+
+ return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+ struct log *log,
+ struct xlog_in_core *iclog,
+ uint flags,
+ int *record_cnt,
+ int *data_cnt,
+ int *partial_copy,
+ int *partial_copy_len,
+ int log_offset,
+ struct xlog_in_core **commit_iclog)
+{
+ if (*partial_copy) {
+ /*
+ * This iclog has already been marked WANT_SYNC by
+ * xlog_state_get_iclog_space.
+ */
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ *record_cnt = 0;
+ *data_cnt = 0;
+ return xlog_state_release_iclog(log, iclog);
+ }
+
+ *partial_copy = 0;
+ *partial_copy_len = 0;
+
+ if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+ /* no more space in this iclog - push it. */
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ *record_cnt = 0;
+ *data_cnt = 0;
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
+
+ if (!commit_iclog)
+ return xlog_state_release_iclog(log, iclog);
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ }
+
+ return 0;
+}
+
+/*
* Write some region out to in-core log
*
* This will be called when writing externally provided regions or when
@@ -1655,209 +1867,157 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
*/
STATIC int
xlog_write(
- struct xfs_mount *mp,
- struct xfs_log_iovec reg[],
- int nentries,
+ struct log *log,
+ struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
uint flags)
{
- xlog_t *log = mp->m_log;
- xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
- xlog_op_header_t *logop_head; /* ptr to log operation header */
- __psint_t ptr; /* copy address into data region */
- int len; /* # xlog_write() bytes 2 still copy */
- int index; /* region index currently copying */
- int log_offset; /* offset (from 0) into data region */
- int start_rec_copy; /* # bytes to copy for start record */
- int partial_copy; /* did we split a region? */
- int partial_copy_len;/* # bytes copied if split region */
- int need_copy; /* # bytes need to memcpy this region */
- int copy_len; /* # bytes actually memcpy'ing */
- int copy_off; /* # bytes from entry start */
- int contwr; /* continued write of in-core log? */
- int error;
- int record_cnt = 0, data_cnt = 0;
-
- partial_copy_len = partial_copy = 0;
-
- /* Calculate potential maximum space. Each region gets its own
- * xlog_op_header_t and may need to be double word aligned.
- */
- len = 0;
- if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
- len += sizeof(xlog_op_header_t);
- ticket->t_res_num_ophdrs++;
- }
+ struct xlog_in_core *iclog = NULL;
+ struct xfs_log_iovec *vecp;
+ struct xfs_log_vec *lv;
+ int len;
+ int index;
+ int partial_copy = 0;
+ int partial_copy_len = 0;
+ int contwr = 0;
+ int record_cnt = 0;
+ int data_cnt = 0;
+ int error;
- for (index = 0; index < nentries; index++) {
- len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
- ticket->t_res_num_ophdrs++;
- len += reg[index].i_len;
- xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
- }
- contwr = *start_lsn = 0;
+ *start_lsn = 0;
- if (ticket->t_curr_res < len) {
- xlog_print_tic_res(mp, ticket);
+ len = xlog_write_calc_vec_length(ticket, log_vector);
+ if (ticket->t_curr_res < len) {
+ xlog_print_tic_res(log->l_mp, ticket);
#ifdef DEBUG
- xlog_panic(
- "xfs_log_write: reservation ran out. Need to up reservation");
+ xlog_panic(
+ "xfs_log_write: reservation ran out. Need to up reservation");
#else
- /* Customer configurable panic */
- xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
- "xfs_log_write: reservation ran out. Need to up reservation");
- /* If we did not panic, shutdown the filesystem */
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ /* Customer configurable panic */
+ xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
+ "xfs_log_write: reservation ran out. Need to up reservation");
+
+ /* If we did not panic, shutdown the filesystem */
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
#endif
- } else
+ }
+
ticket->t_curr_res -= len;
- for (index = 0; index < nentries; ) {
- if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset)))
- return error;
+ index = 0;
+ lv = log_vector;
+ vecp = lv->lv_iovecp;
+ while (lv && index < lv->lv_niovecs) {
+ void *ptr;
+ int log_offset;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &contwr, &log_offset);
+ if (error)
+ return error;
- /* start_lsn is the first lsn written to. That's all we need. */
- if (! *start_lsn)
- *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ ASSERT(log_offset <= iclog->ic_size - 1);
+ ptr = iclog->ic_datap + log_offset;
- /* This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
- */
- while (index < nentries) {
- ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
- ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
- start_rec_copy = 0;
-
- /* If first write for transaction, insert start record.
- * We can't be trying to commit if we are inited. We can't
- * have any "partial_copy" if we are inited.
- */
- if (ticket->t_flags & XLOG_TIC_INITED) {
- logop_head = (xlog_op_header_t *)ptr;
- logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
- logop_head->oh_clientid = ticket->t_clientid;
- logop_head->oh_len = 0;
- logop_head->oh_flags = XLOG_START_TRANS;
- logop_head->oh_res2 = 0;
- ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
- record_cnt++;
-
- start_rec_copy = sizeof(xlog_op_header_t);
- xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
- }
+ /* start_lsn is the first lsn written to. That's all we need. */
+ if (!*start_lsn)
+ *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- /* Copy log operation header directly into data section */
- logop_head = (xlog_op_header_t *)ptr;
- logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
- logop_head->oh_clientid = ticket->t_clientid;
- logop_head->oh_res2 = 0;
+ /*
+ * This loop writes out as many regions as can fit in the amount
+ * of space which was allocated by xlog_state_get_iclog_space().
+ */
+ while (lv && index < lv->lv_niovecs) {
+ struct xfs_log_iovec *reg = &vecp[index];
+ struct xlog_op_header *ophdr;
+ int start_rec_copy;
+ int copy_len;
+ int copy_off;
+
+ ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+ ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+ start_rec_copy = xlog_write_start_rec(ptr, ticket);
+ if (start_rec_copy) {
+ record_cnt++;
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ start_rec_copy);
+ }
- /* header copied directly */
- xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
+ ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+ if (!ophdr)
+ return XFS_ERROR(EIO);
- /* are we copying a commit or unmount record? */
- logop_head->oh_flags = flags;
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ sizeof(struct xlog_op_header));
+
+ len += xlog_write_setup_copy(ticket, ophdr,
+ iclog->ic_size-log_offset,
+ reg->i_len,
+ &copy_off, &copy_len,
+ &partial_copy,
+ &partial_copy_len);
+ xlog_verify_dest_ptr(log, ptr);
+
+ /* copy region */
+ ASSERT(copy_len >= 0);
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+ copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ record_cnt++;
+ data_cnt += contwr ? copy_len : 0;
+
+ error = xlog_write_copy_finish(log, iclog, flags,
+ &record_cnt, &data_cnt,
+ &partial_copy,
+ &partial_copy_len,
+ log_offset,
+ commit_iclog);
+ if (error)
+ return error;
- /*
- * We've seen logs corrupted with bad transaction client
- * ids. This makes sure that XFS doesn't generate them on.
- * Turn this into an EIO and shut down the filesystem.
- */
- switch (logop_head->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_fs_cmn_err(CE_WARN, mp,
- "Bad XFS transaction clientid 0x%x in ticket 0x%p",
- logop_head->oh_clientid, ticket);
- return XFS_ERROR(EIO);
- }
+ /*
+ * if we had a partial copy, we need to get more iclog
+ * space but we don't want to increment the region
+ * index because there is still more is this region to
+ * write.
+ *
+ * If we completed writing this region, and we flushed
+ * the iclog (indicated by resetting of the record
+ * count), then we also need to get more log space. If
+ * this was the last record, though, we are done and
+ * can just return.
+ */
+ if (partial_copy)
+ break;
- /* Partial write last time? => (partial_copy != 0)
- * need_copy is the amount we'd like to copy if everything could
- * fit in the current memcpy.
- */
- need_copy = reg[index].i_len - partial_copy_len;
-
- copy_off = partial_copy_len;
- if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
- copy_len = need_copy;
- logop_head->oh_len = cpu_to_be32(copy_len);
- if (partial_copy)
- logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- partial_copy_len = partial_copy = 0;
- } else { /* partial write */
- copy_len = iclog->ic_size - log_offset;
- logop_head->oh_len = cpu_to_be32(copy_len);
- logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
- if (partial_copy)
- logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
- partial_copy_len += copy_len;
- partial_copy++;
- len += sizeof(xlog_op_header_t); /* from splitting of region */
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
- ticket->t_res_num_ophdrs++;
- }
- xlog_verify_dest_ptr(log, ptr);
-
- /* copy region */
- ASSERT(copy_len >= 0);
- memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
-
- /* make copy_len total bytes copied, including headers */
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
- record_cnt++;
- data_cnt += contwr ? copy_len : 0;
- if (partial_copy) { /* copied partial region */
- /* already marked WANT_SYNC by xlog_state_get_iclog_space */
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- record_cnt = data_cnt = 0;
- if ((error = xlog_state_release_iclog(log, iclog)))
- return error;
- break; /* don't increment index */
- } else { /* copied entire region */
- index++;
- partial_copy_len = partial_copy = 0;
-
- if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- record_cnt = data_cnt = 0;
- spin_lock(&log->l_icloglock);
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
- if (commit_iclog) {
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- } else if ((error = xlog_state_release_iclog(log, iclog)))
- return error;
- if (index == nentries)
- return 0; /* we are done */
- else
- break;
+ if (++index == lv->lv_niovecs) {
+ lv = lv->lv_next;
+ index = 0;
+ if (lv)
+ vecp = lv->lv_iovecp;
+ }
+ if (record_cnt == 0) {
+ if (!lv)
+ return 0;
+ break;
+ }
}
- } /* if (partial_copy) */
- } /* while (index < nentries) */
- } /* for (index = 0; index < nentries; ) */
- ASSERT(len == 0);
+ }
+
+ ASSERT(len == 0);
+
+ xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+ if (!commit_iclog)
+ return xlog_state_release_iclog(log, iclog);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (commit_iclog) {
ASSERT(flags & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
return 0;
- }
- return xlog_state_release_iclog(log, iclog);
-} /* xlog_write */
+}
/*****************************************************************************
@@ -3157,14 +3317,16 @@ xfs_log_ticket_get(
* Allocate and initialise a new log ticket.
*/
STATIC xlog_ticket_t *
-xlog_ticket_alloc(xlog_t *log,
- int unit_bytes,
- int cnt,
- char client,
- uint xflags)
+xlog_ticket_alloc(
+ struct log *log,
+ int unit_bytes,
+ int cnt,
+ char client,
+ uint xflags)
{
- xlog_ticket_t *tic;
+ struct xlog_ticket *tic;
uint num_headers;
+ int iclog_space;
tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
if (!tic)
@@ -3208,16 +3370,40 @@ xlog_ticket_alloc(xlog_t *log,
/* for start-rec */
unit_bytes += sizeof(xlog_op_header_t);
- /* for LR headers */
- num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+ /*
+ * for LR headers - the space for data in an iclog is the size minus
+ * the space used for the headers. If we use the iclog size, then we
+ * undercalculate the number of headers required.
+ *
+ * Furthermore - the addition of op headers for split-recs might
+ * increase the space required enough to require more log and op
+ * headers, so take that into account too.
+ *
+ * IMPORTANT: This reservation makes the assumption that if this
+ * transaction is the first in an iclog and hence has the LR headers
+ * accounted to it, then the remaining space in the iclog is
+ * exclusively for this transaction. i.e. if the transaction is larger
+ * than the iclog, it will be the only thing in that iclog.
+ * Fundamentally, this means we must pass the entire log vector to
+ * xlog_write to guarantee this.
+ */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ num_headers = howmany(unit_bytes, iclog_space);
+
+ /* for split-recs - ophdrs added when data split over LRs */
+ unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+ /* add extra header reservations if we overrun */
+ while (!num_headers ||
+ howmany(unit_bytes, iclog_space) > num_headers) {
+ unit_bytes += sizeof(xlog_op_header_t);
+ num_headers++;
+ }
unit_bytes += log->l_iclog_hsize * num_headers;
/* for commit-rec LR header - note: padding will subsume the ophdr */
unit_bytes += log->l_iclog_hsize;
- /* for split-recs - ophdrs added when data split over LRs */
- unit_bytes += sizeof(xlog_op_header_t) * num_headers;
-
/* for roundoff padding for transaction data and one for commit record */
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3233,13 +3419,13 @@ xlog_ticket_alloc(xlog_t *log,
tic->t_curr_res = unit_bytes;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+ tic->t_tid = random32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
tic->t_trans_type = 0;
if (xflags & XFS_LOG_PERM_RESERV)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
+ sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
xlog_tic_reset_res(tic);
@@ -3260,20 +3446,22 @@ xlog_ticket_alloc(xlog_t *log,
* part of the log in case we trash the log structure.
*/
void
-xlog_verify_dest_ptr(xlog_t *log,
- __psint_t ptr)
+xlog_verify_dest_ptr(
+ struct log *log,
+ char *ptr)
{
int i;
int good_ptr = 0;
- for (i=0; i < log->l_iclog_bufs; i++) {
- if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
- ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ if (ptr >= log->l_iclog_bak[i] &&
+ ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
good_ptr++;
}
- if (! good_ptr)
+
+ if (!good_ptr)
xlog_panic("xlog_verify_dest_ptr: invalid ptr");
-} /* xlog_verify_dest_ptr */
+}
STATIC void
xlog_verify_grant_head(xlog_t *log, int equals)
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7..229d1f3 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,6 +110,12 @@ typedef struct xfs_log_iovec {
uint i_type; /* type of region */
} xfs_log_iovec_t;
+struct xfs_log_vec {
+ struct xfs_log_vec *lv_next; /* next lv in build list */
+ int lv_niovecs; /* number of iovecs in lv */
+ struct xfs_log_iovec *lv_iovecp; /* iovec array */
+};
+
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
@@ -126,6 +132,13 @@ typedef struct xfs_log_callback {
struct xfs_mount;
struct xlog_in_core;
struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+
+void xfs_log_item_init(struct xfs_mount *mp,
+ struct xfs_log_item *item,
+ int type,
+ struct xfs_item_ops *ops);
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18..9cf6951 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -396,9 +396,7 @@ typedef struct log {
struct xfs_buf_cancel **l_buf_cancel_table;
int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */
- uint l_sectbb_log; /* log2 of sector size in BBs */
- uint l_sectbb_mask; /* sector size (in BBs)
- * alignment mask */
+ uint l_sectBBsize; /* sector size in BBs (2^n) */
int l_iclog_size; /* size of log in bytes */
int l_iclog_size_log; /* log power size of log */
int l_iclog_bufs; /* number of iclog buffers */
@@ -449,6 +447,14 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
extern kmem_zone_t *xfs_log_ticket_zone;
+static inline void
+xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
+{
+ *ptr += bytes;
+ *len -= bytes;
+ *off += bytes;
+}
+
/*
* Unmount record type is used as a pseudo transaction type for the ticket.
* It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efd..0de08e36 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
#define xlog_recover_check_summary(log)
#endif
-
/*
* Sector aligned buffer routines for buffer create/read/write/access
*/
-#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
- ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
- ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
-#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
+/*
+ * Verify the given count of basic blocks is valid number of blocks
+ * to specify for an operation involving the given XFS log buffer.
+ * Returns nonzero if the count is valid, 0 otherwise.
+ */
+static inline int
+xlog_buf_bbcount_valid(
+ xlog_t *log,
+ int bbcount)
+{
+ return bbcount > 0 && bbcount <= log->l_logBBsize;
+}
+
+/*
+ * Allocate a buffer to hold log data. The buffer needs to be able
+ * to map to a range of nbblks basic blocks at any valid (basic
+ * block) offset within the log.
+ */
STATIC xfs_buf_t *
xlog_get_bp(
xlog_t *log,
int nbblks)
{
- if (nbblks <= 0 || nbblks > log->l_logBBsize) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
- XFS_ERROR_REPORT("xlog_get_bp(1)",
- XFS_ERRLEVEL_HIGH, log->l_mp);
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return NULL;
}
- if (log->l_sectbb_log) {
- if (nbblks > 1)
- nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
- nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
- }
+ /*
+ * We do log I/O in units of log sectors (a power-of-2
+ * multiple of the basic block size), so we round up the
+ * requested size to acommodate the basic blocks required
+ * for complete log sectors.
+ *
+ * In addition, the buffer may be used for a non-sector-
+ * aligned block offset, in which case an I/O of the
+ * requested size could extend beyond the end of the
+ * buffer. If the requested size is only 1 basic block it
+ * will never straddle a sector boundary, so this won't be
+ * an issue. Nor will this be a problem if the log I/O is
+ * done in basic blocks (sector size 1). But otherwise we
+ * extend the buffer by one extra log sector to ensure
+ * there's space to accomodate this possiblility.
+ */
+ if (nbblks > 1 && log->l_sectBBsize > 1)
+ nbblks += log->l_sectBBsize;
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+
return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
}
@@ -93,6 +121,10 @@ xlog_put_bp(
xfs_buf_free(bp);
}
+/*
+ * Return the address of the start of the given block number's data
+ * in a log buffer. The buffer covers a log sector-aligned region.
+ */
STATIC xfs_caddr_t
xlog_align(
xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
int nbblks,
xfs_buf_t *bp)
{
+ xfs_daddr_t offset;
xfs_caddr_t ptr;
- if (!log->l_sectbb_log)
- return XFS_BUF_PTR(bp);
+ offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
+ ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
+
+ ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
- ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
- ASSERT(XFS_BUF_SIZE(bp) >=
- BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
return ptr;
}
@@ -124,21 +156,18 @@ xlog_bread_noalign(
{
int error;
- if (nbblks <= 0 || nbblks > log->l_logBBsize) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
- XFS_ERROR_REPORT("xlog_bread(1)",
- XFS_ERRLEVEL_HIGH, log->l_mp);
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return EFSCORRUPTED;
}
- if (log->l_sectbb_log) {
- blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
- nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
- }
+ blk_no = round_down(blk_no, log->l_sectBBsize);
+ nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
- ASSERT(bp);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
{
int error;
- if (nbblks <= 0 || nbblks > log->l_logBBsize) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
- XFS_ERROR_REPORT("xlog_bwrite(1)",
- XFS_ERRLEVEL_HIGH, log->l_mp);
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return EFSCORRUPTED;
}
- if (log->l_sectbb_log) {
- blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
- nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
- }
+ blk_no = round_down(blk_no, log->l_sectBBsize);
+ nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
{
xfs_caddr_t offset;
xfs_daddr_t mid_blk;
+ xfs_daddr_t end_blk;
uint mid_cycle;
int error;
- mid_blk = BLK_AVG(first_blk, *last_blk);
- while (mid_blk != first_blk && mid_blk != *last_blk) {
+ end_blk = *last_blk;
+ mid_blk = BLK_AVG(first_blk, end_blk);
+ while (mid_blk != first_blk && mid_blk != end_blk) {
error = xlog_bread(log, mid_blk, 1, bp, &offset);
if (error)
return error;
mid_cycle = xlog_get_cycle(offset);
- if (mid_cycle == cycle) {
- *last_blk = mid_blk;
- /* last_half_cycle == mid_cycle */
- } else {
- first_blk = mid_blk;
- /* first_half_cycle == mid_cycle */
- }
- mid_blk = BLK_AVG(first_blk, *last_blk);
+ if (mid_cycle == cycle)
+ end_blk = mid_blk; /* last_half_cycle == mid_cycle */
+ else
+ first_blk = mid_blk; /* first_half_cycle == mid_cycle */
+ mid_blk = BLK_AVG(first_blk, end_blk);
}
- ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
- (mid_blk == *last_blk && mid_blk-1 == first_blk));
+ ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
+ (mid_blk == end_blk && mid_blk-1 == first_blk));
+
+ *last_blk = end_blk;
return 0;
}
/*
- * Check that the range of blocks does not contain the cycle number
- * given. The scan needs to occur from front to back and the ptr into the
- * region must be updated since a later routine will need to perform another
- * test. If the region is completely good, we end up returning the same
- * last block number.
- *
- * Set blkno to -1 if we encounter no errors. This is an invalid block number
- * since we don't ever expect logs to get this large.
+ * Check that a range of blocks does not contain stop_on_cycle_no.
+ * Fill in *new_blk with the block offset where such a block is
+ * found, or with -1 (an invalid block number) if there is no such
+ * block in the range. The scan needs to occur from front to back
+ * and the pointer into the region must be updated since a later
+ * routine will need to perform another test.
*/
STATIC int
xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
xfs_caddr_t buf = NULL;
int error = 0;
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks we'll be examining. If that fails,
+ * try a smaller size. We need to be able to read at least
+ * a log sector, or we're out of luck.
+ */
bufblks = 1 << ffs(nbblks);
-
while (!(bp = xlog_get_bp(log, bufblks))) {
- /* can't get enough memory to do everything in one big buffer */
bufblks >>= 1;
- if (bufblks <= log->l_sectbb_log)
+ if (bufblks < log->l_sectBBsize)
return ENOMEM;
}
@@ -629,7 +659,7 @@ xlog_find_head(
* In this case we want to find the first block with cycle
* number matching last_half_cycle. We expect the log to be
* some variation on
- * x + 1 ... | x ...
+ * x + 1 ... | x ... | x
* The first block with cycle number x (last_half_cycle) will
* be where the new head belongs. First we do a binary search
* for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
* the log, then we look for occurrences of last_half_cycle - 1
* at the end of the log. The cases we're looking for look
* like
- * x + 1 ... | x | x + 1 | x ...
- * ^ binary search stopped here
+ * v binary search stopped here
+ * x + 1 ... | x | x + 1 | x ... | x
+ * ^ but we want to locate this spot
* or
- * x + 1 ... | x ... | x - 1 | x
* <---------> less than scan distance
+ * x + 1 ... | x ... | x - 1 | x
+ * ^ we want to locate this spot
*/
stop_on_cycle = last_half_cycle;
if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
* certainly not the head of the log. By searching for
* last_half_cycle-1 we accomplish that.
*/
- start_blk = log_bbnum - num_scan_bblks + head_blk;
ASSERT(head_blk <= INT_MAX &&
- (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
+ (xfs_daddr_t) num_scan_bblks >= head_blk);
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
if ((error = xlog_find_verify_cycle(log, start_blk,
num_scan_bblks - (int)head_blk,
(stop_on_cycle - 1), &new_blk)))
goto bp_err;
if (new_blk != -1) {
head_blk = new_blk;
- goto bad_blk;
+ goto validate_head;
}
/*
@@ -726,7 +758,7 @@ xlog_find_head(
head_blk = new_blk;
}
- bad_blk:
+validate_head:
/*
* Now we need to make sure head_blk is not pointing to a block in
* the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
if ((error = xlog_find_verify_log_record(log, start_blk,
&head_blk, 0)) == -1) {
/* We hit the beginning of the log during our search */
- start_blk = log_bbnum - num_scan_bblks + head_blk;
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
new_blk = log_bbnum;
ASSERT(start_blk <= INT_MAX &&
(xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +865,12 @@ xlog_find_tail(
if (*head_blk == 0) { /* special case */
error = xlog_bread(log, 0, 1, bp, &offset);
if (error)
- goto bread_err;
+ goto done;
if (xlog_get_cycle(offset) == 0) {
*tail_blk = 0;
/* leave all other log inited values alone */
- goto exit;
+ goto done;
}
}
@@ -849,7 +881,7 @@ xlog_find_tail(
for (i = (int)(*head_blk) - 1; i >= 0; i--) {
error = xlog_bread(log, i, 1, bp, &offset);
if (error)
- goto bread_err;
+ goto done;
if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
error = xlog_bread(log, i, 1, bp, &offset);
if (error)
- goto bread_err;
+ goto done;
if (XLOG_HEADER_MAGIC_NUM ==
be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
umount_data_blk = (i + hblks) % log->l_logBBsize;
error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
if (error)
- goto bread_err;
+ goto done;
op_head = (xlog_op_header_t *)offset;
if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
* But... if the -device- itself is readonly, just skip this.
* We can't recover this device anyway, so it won't matter.
*/
- if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+ if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
error = xlog_clear_stale_blocks(log, tail_lsn);
- }
-bread_err:
-exit:
+done:
xlog_put_bp(bp);
if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
xfs_caddr_t offset;
xfs_buf_t *bp;
int balign, ealign;
- int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+ int sectbb = log->l_sectBBsize;
int end_block = start_block + blocks;
int bufblks;
int error = 0;
int i, j = 0;
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks to be written. If that fails, try
+ * a smaller size. We need to be able to write at least a
+ * log sector, or we're out of luck.
+ */
bufblks = 1 << ffs(blocks);
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
- if (bufblks <= log->l_sectbb_log)
+ if (bufblks < sectbb)
return ENOMEM;
}
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
* the buffer in the starting sector not covered by the first
* write below.
*/
- balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
+ balign = round_down(start_block, sectbb);
if (balign != start_block) {
error = xlog_bread_noalign(log, start_block, 1, bp);
if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
* the buffer in the final sector not covered by the write.
* If this is the same sector as the above read, skip it.
*/
- ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
+ ealign = round_down(end_block, sectbb);
if (j == 0 && (start_block + endcount > ealign)) {
offset = XFS_BUF_PTR(bp);
balign = BBTOB(ealign - start_block);
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
STATIC int
xlog_recover_add_to_cont_trans(
+ struct log *log,
xlog_recover_t *trans,
xfs_caddr_t dp,
int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
return 0;
}
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
*/
STATIC int
xlog_recover_add_to_trans(
+ struct log *log,
xlog_recover_t *trans,
xfs_caddr_t dp,
int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
item->ri_buf[item->ri_cnt].i_addr = ptr;
item->ri_buf[item->ri_cnt].i_len = len;
item->ri_cnt++;
+ trace_xfs_log_recover_item_add(log, trans, item, 0);
return 0;
}
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
*/
STATIC int
xlog_recover_reorder_trans(
- xlog_recover_t *trans)
+ struct log *log,
+ xlog_recover_t *trans,
+ int pass)
{
xlog_recover_item_t *item, *n;
LIST_HEAD(sort_list);
@@ -1535,6 +1577,8 @@ xlog_recover_reorder_trans(
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+ trace_xfs_log_recover_item_reorder_head(log,
+ trans, item, pass);
list_move(&item->ri_list, &trans->r_itemq);
break;
}
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
case XFS_LI_QUOTAOFF:
case XFS_LI_EFD:
case XFS_LI_EFI:
+ trace_xfs_log_recover_item_reorder_tail(log,
+ trans, item, pass);
list_move_tail(&item->ri_list, &trans->r_itemq);
break;
default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
/*
* If this isn't a cancel buffer item, then just return.
*/
- if (!(flags & XFS_BLI_CANCEL))
+ if (!(flags & XFS_BLI_CANCEL)) {
+ trace_xfs_log_recover_buf_not_cancel(log, buf_f);
return;
+ }
/*
* Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
while (nextp != NULL) {
if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
nextp->bc_refcount++;
+ trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
return;
}
prevp = nextp;
@@ -1640,6 +1689,7 @@ xlog_recover_do_buffer_pass1(
bcp->bc_refcount = 1;
bcp->bc_next = NULL;
prevp->bc_next = bcp;
+ trace_xfs_log_recover_buf_cancel_add(log, buf_f);
}
/*
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
unsigned int *data_map = NULL;
unsigned int map_size = 0;
+ trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
switch (buf_f->blf_type) {
case XFS_LI_BUF:
data_map = buf_f->blf_data_map;
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
/*ARGSUSED*/
STATIC void
xlog_recover_do_reg_buffer(
+ struct xfs_mount *mp,
xlog_recover_item_t *item,
xfs_buf_t *bp,
xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
unsigned int map_size = 0;
int error;
+ trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
switch (buf_f->blf_type) {
case XFS_LI_BUF:
data_map = buf_f->blf_data_map;
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
{
uint type;
+ trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
/*
* Filesystems are required to send in quota flags at mount time.
*/
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
if (log->l_quotaoffs_flag & type)
return;
- xlog_recover_do_reg_buffer(item, bp, buf_f);
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
/*
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
*/
cancel = xlog_recover_do_buffer_pass2(log, buf_f);
if (cancel) {
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
return 0;
}
}
+ trace_xfs_log_recover_buf_recover(log, buf_f);
switch (buf_f->blf_type) {
case XFS_LI_BUF:
blkno = buf_f->blf_blkno;
@@ -2204,7 +2263,7 @@ xlog_recover_do_buffer_trans(
(XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
} else {
- xlog_recover_do_reg_buffer(item, bp, buf_f);
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
if (error)
return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
in_f->ilf_len, 0)) {
error = 0;
+ trace_xfs_log_recover_inode_cancel(log, in_f);
goto error;
}
+ trace_xfs_log_recover_inode_recover(log, in_f);
bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
/* do nothing */
} else {
xfs_buf_relse(bp);
+ trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
goto error;
}
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
int error = 0;
xlog_recover_item_t *item;
- error = xlog_recover_reorder_trans(trans);
+ error = xlog_recover_reorder_trans(log, trans, pass);
if (error)
return error;
list_for_each_entry(item, &trans->r_itemq, ri_list) {
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
switch (ITEM_TYPE(item)) {
case XFS_LI_BUF:
error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
error = xlog_recover_unmount_trans(trans);
break;
case XLOG_WAS_CONT_TRANS:
- error = xlog_recover_add_to_cont_trans(trans,
- dp, be32_to_cpu(ohead->oh_len));
+ error = xlog_recover_add_to_cont_trans(log,
+ trans, dp,
+ be32_to_cpu(ohead->oh_len));
break;
case XLOG_START_TRANS:
xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
break;
case 0:
case XLOG_CONTINUE_TRANS:
- error = xlog_recover_add_to_trans(trans,
+ error = xlog_recover_add_to_trans(log, trans,
dp, be32_to_cpu(ohead->oh_len));
break;
default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
}
}
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-STATIC void
-xlog_unpack_data_checksum(
- xlog_rec_header_t *rhead,
- xfs_caddr_t dp,
- xlog_t *log)
-{
- __be32 *up = (__be32 *)dp;
- uint chksum = 0;
- int i;
-
- /* divide length by 4 to get # words */
- for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
- chksum ^= be32_to_cpu(*up);
- up++;
- }
- if (chksum != be32_to_cpu(rhead->h_chksum)) {
- if (rhead->h_chksum ||
- ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
- cmn_err(CE_DEBUG,
- "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
- be32_to_cpu(rhead->h_chksum), chksum);
- cmn_err(CE_DEBUG,
-"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- cmn_err(CE_DEBUG,
- "XFS: LogR this is a LogV2 filesystem\n");
- }
- log->l_flags |= XLOG_CHKSUM_MISMATCH;
- }
- }
-}
-#else
-#define xlog_unpack_data_checksum(rhead, dp, log)
-#endif
-
STATIC void
xlog_unpack_data(
xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
-
- xlog_unpack_data_checksum(rhead, dp, log);
}
STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
hblks = 1;
}
} else {
- ASSERT(log->l_sectbb_log == 0);
+ ASSERT(log->l_sectBBsize == 1);
hblks = 1;
hbp = xlog_get_bp(log, 1);
h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
xfs_agf_t *agfp;
xfs_buf_t *agfbp;
xfs_buf_t *agibp;
- xfs_buf_t *sbbp;
-#ifdef XFS_LOUD_RECOVERY
- xfs_sb_t *sbp;
-#endif
xfs_agnumber_t agno;
__uint64_t freeblks;
__uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
xfs_buf_relse(agibp);
}
}
-
- sbbp = xfs_getsb(mp, 0);
-#ifdef XFS_LOUD_RECOVERY
- sbp = &mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
- cmn_err(CE_NOTE,
- "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
- sbp->sb_icount, itotal);
- cmn_err(CE_NOTE,
- "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
- sbp->sb_ifree, ifree);
- cmn_err(CE_NOTE,
- "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
- sbp->sb_fdblocks, freeblks);
-#if 0
- /*
- * This is turned off until I account for the allocation
- * btree blocks which live in free space.
- */
- ASSERT(sbp->sb_icount == itotal);
- ASSERT(sbp->sb_ifree == ifree);
- ASSERT(sbp->sb_fdblocks == freeblks);
-#endif
-#endif
- xfs_buf_relse(sbbp);
}
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b..d7bf38c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
xfs_qm_mount_quotas(mp);
}
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
- if (XFS_IS_QUOTA_ON(mp))
- xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
- else
- xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
-#endif
-
/*
* Now we are mounted, reserve a small amount of unused space for
* privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f..e0e64b1 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
-#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
-#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358..be578ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -45,23 +45,12 @@
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
-
-STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
-STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
-STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
-STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
-STATIC void xfs_trans_committed(xfs_trans_t *, int);
-STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
-STATIC void xfs_trans_free(xfs_trans_t *);
-
kmem_zone_t *xfs_trans_zone;
-
/*
* Reservation functions here avoid a huge stack in xfs_trans_init
* due to register overflow from temporaries in the calculations.
*/
-
STATIC uint
xfs_calc_write_reservation(xfs_mount_t *mp)
{
@@ -261,6 +250,19 @@ _xfs_trans_alloc(
}
/*
+ * Free the transaction structure. If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+ xfs_trans_t *tp)
+{
+ atomic_dec(&tp->t_mountp->m_active_trans);
+ xfs_trans_free_dqinfo(tp);
+ kmem_zone_free(xfs_trans_zone, tp);
+}
+
+/*
* This is called to create a new transaction which will share the
* permanent log reservation of the given transaction. The remaining
* unused block and rt extent reservations are also inherited. This
@@ -764,94 +766,278 @@ xfs_trans_unreserve_and_mod_sb(
}
}
+/*
+ * Total up the number of log iovecs needed to commit this
+ * transaction. The transaction itself needs one for the
+ * transaction header. Ask each dirty item in turn how many
+ * it needs to get the total.
+ */
+static uint
+xfs_trans_count_vecs(
+ struct xfs_trans *tp)
+{
+ int nvecs;
+ xfs_log_item_desc_t *lidp;
+
+ nvecs = 1;
+ lidp = xfs_trans_first_item(tp);
+ ASSERT(lidp != NULL);
+
+ /* In the non-debug case we need to start bailing out if we
+ * didn't find a log_item here, return zero and let trans_commit
+ * deal with it.
+ */
+ if (lidp == NULL)
+ return 0;
+
+ while (lidp != NULL) {
+ /*
+ * Skip items which aren't dirty in this transaction.
+ */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+ lidp = xfs_trans_next_item(tp, lidp);
+ continue;
+ }
+ lidp->lid_size = IOP_SIZE(lidp->lid_item);
+ nvecs += lidp->lid_size;
+ lidp = xfs_trans_next_item(tp, lidp);
+ }
+
+ return nvecs;
+}
/*
- * xfs_trans_commit
+ * Fill in the vector with pointers to data to be logged
+ * by this transaction. The transaction header takes
+ * the first vector, and then each dirty item takes the
+ * number of vectors it indicated it needed in xfs_trans_count_vecs().
*
- * Commit the given transaction to the log a/synchronously.
+ * As each item fills in the entries it needs, also pin the item
+ * so that it cannot be flushed out until the log write completes.
+ */
+static void
+xfs_trans_fill_vecs(
+ struct xfs_trans *tp,
+ struct xfs_log_iovec *log_vector)
+{
+ xfs_log_item_desc_t *lidp;
+ struct xfs_log_iovec *vecp;
+ uint nitems;
+
+ /*
+ * Skip over the entry for the transaction header, we'll
+ * fill that in at the end.
+ */
+ vecp = log_vector + 1;
+
+ nitems = 0;
+ lidp = xfs_trans_first_item(tp);
+ ASSERT(lidp);
+ while (lidp) {
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+ lidp = xfs_trans_next_item(tp, lidp);
+ continue;
+ }
+
+ /*
+ * The item may be marked dirty but not log anything. This can
+ * be used to get called when a transaction is committed.
+ */
+ if (lidp->lid_size)
+ nitems++;
+ IOP_FORMAT(lidp->lid_item, vecp);
+ vecp += lidp->lid_size;
+ IOP_PIN(lidp->lid_item);
+ lidp = xfs_trans_next_item(tp, lidp);
+ }
+
+ /*
+ * Now that we've counted the number of items in this transaction, fill
+ * in the transaction header. Note that the transaction header does not
+ * have a log item.
+ */
+ tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+ tp->t_header.th_type = tp->t_type;
+ tp->t_header.th_num_items = nitems;
+ log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+ log_vector->i_len = sizeof(xfs_trans_header_t);
+ log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
+}
+
+/*
+ * The committed item processing consists of calling the committed routine of
+ * each logged item, updating the item's position in the AIL if necessary, and
+ * unpinning each item. If the committed routine returns -1, then do nothing
+ * further with the item because it may have been freed.
*
- * XFS disk error handling mechanism is not based on a typical
- * transaction abort mechanism. Logically after the filesystem
- * gets marked 'SHUTDOWN', we can't let any new transactions
- * be durable - ie. committed to disk - because some metadata might
- * be inconsistent. In such cases, this returns an error, and the
- * caller may assume that all locked objects joined to the transaction
- * have already been unlocked as if the commit had succeeded.
- * Do not reference the transaction structure after this call.
+ * Since items are unlocked when they are copied to the incore log, it is
+ * possible for two transactions to be completing and manipulating the same
+ * item simultaneously. The AIL lock will protect the lsn field of each item.
+ * The value of this field can never go backwards.
+ *
+ * We unpin the items after repositioning them in the AIL, because otherwise
+ * they could be immediately flushed and we'd have to race with the flusher
+ * trying to pull the item from the AIL as we add it.
*/
- /*ARGSUSED*/
-int
-_xfs_trans_commit(
- xfs_trans_t *tp,
- uint flags,
- int *log_flushed)
+static void
+xfs_trans_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn,
+ int aborted)
{
- xfs_log_iovec_t *log_vector;
- int nvec;
- xfs_mount_t *mp;
- xfs_lsn_t commit_lsn;
- /* REFERENCED */
- int error;
- int log_flags;
- int sync;
-#define XFS_TRANS_LOGVEC_COUNT 16
- xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
- struct xlog_in_core *commit_iclog;
- int shutdown;
+ xfs_lsn_t item_lsn;
+ struct xfs_ail *ailp;
- commit_lsn = -1;
+ if (aborted)
+ lip->li_flags |= XFS_LI_ABORTED;
+ item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+ /* If the committed routine returns -1, item has been freed. */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ return;
/*
- * Determine whether this commit is releasing a permanent
- * log reservation or not.
+ * If the returned lsn is greater than what it contained before, update
+ * the location of the item in the AIL. If it is not, then do nothing.
+ * Items can never move backwards in the AIL.
+ *
+ * While the new lsn should usually be greater, it is possible that a
+ * later transaction completing simultaneously with an earlier one
+ * using the same item could complete first with a higher lsn. This
+ * would cause the earlier transaction to fail the test below.
*/
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
+ ailp = lip->li_ailp;
+ spin_lock(&ailp->xa_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
+ /*
+ * This will set the item's lsn to item_lsn and update the
+ * position of the item in the AIL.
+ *
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_update(ailp, lip, item_lsn);
} else {
- log_flags = 0;
+ spin_unlock(&ailp->xa_lock);
}
- mp = tp->t_mountp;
/*
- * If there is nothing to be logged by the transaction,
- * then unlock all of the items associated with the
- * transaction and free the transaction structure.
- * Also make sure to return any reserved blocks to
- * the free pool.
+ * Now that we've repositioned the item in the AIL, unpin it so it can
+ * be flushed. Pass information about buffer stale state down from the
+ * log item flags, if anyone else stales the buffer we do not want to
+ * pay any attention to it.
*/
-shut_us_down:
- shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
- if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
- xfs_trans_unreserve_and_mod_sb(tp);
+ IOP_UNPIN(lip);
+}
+
+/* Clear all the per-AG busy list items listed in this transaction */
+static void
+xfs_trans_clear_busy_extents(
+ struct xfs_trans *tp)
+{
+ xfs_log_busy_chunk_t *lbcp;
+ xfs_log_busy_slot_t *lbsp;
+ int i;
+
+ for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
+ i = 0;
+ for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
+ if (XFS_LBC_ISFREE(lbcp, i))
+ continue;
+ xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
+ }
+ }
+ xfs_trans_free_busy(tp);
+}
+
+/*
+ * This is typically called by the LM when a transaction has been fully
+ * committed to disk. It needs to unpin the items which have
+ * been logged by the transaction and update their positions
+ * in the AIL if necessary.
+ *
+ * This also gets called when the transactions didn't get written out
+ * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
+ */
+STATIC void
+xfs_trans_committed(
+ struct xfs_trans *tp,
+ int abortflag)
+{
+ xfs_log_item_desc_t *lidp;
+ xfs_log_item_chunk_t *licp;
+ xfs_log_item_chunk_t *next_licp;
+
+ /* Call the transaction's completion callback if there is one. */
+ if (tp->t_callback != NULL)
+ tp->t_callback(tp, tp->t_callarg);
+
+ for (lidp = xfs_trans_first_item(tp);
+ lidp != NULL;
+ lidp = xfs_trans_next_item(tp, lidp)) {
+ xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
+ }
+
+ /* free the item chunks, ignoring the embedded chunk */
+ for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
+ next_licp = licp->lic_next;
+ kmem_free(licp);
+ }
+
+ xfs_trans_clear_busy_extents(tp);
+ xfs_trans_free(tp);
+}
+
+/*
+ * Called from the trans_commit code when we notice that
+ * the filesystem is in the middle of a forced shutdown.
+ */
+STATIC void
+xfs_trans_uncommit(
+ struct xfs_trans *tp,
+ uint flags)
+{
+ xfs_log_item_desc_t *lidp;
+
+ for (lidp = xfs_trans_first_item(tp);
+ lidp != NULL;
+ lidp = xfs_trans_next_item(tp, lidp)) {
/*
- * It is indeed possible for the transaction to be
- * not dirty but the dqinfo portion to be. All that
- * means is that we have some (non-persistent) quota
- * reservations that need to be unreserved.
+ * Unpin all but those that aren't dirty.
*/
- xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket,
- NULL, log_flags);
- if (commit_lsn == -1 && !shutdown)
- shutdown = XFS_ERROR(EIO);
- }
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
- xfs_trans_free_busy(tp);
- xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
- return (shutdown);
+ if (lidp->lid_flags & XFS_LID_DIRTY)
+ IOP_UNPIN_REMOVE(lidp->lid_item, tp);
}
- ASSERT(tp->t_ticket != NULL);
- /*
- * If we need to update the superblock, then do it now.
- */
- if (tp->t_flags & XFS_TRANS_SB_DIRTY)
- xfs_trans_apply_sb_deltas(tp);
- xfs_trans_apply_dquot_deltas(tp);
+ xfs_trans_unreserve_and_mod_sb(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp);
+
+ xfs_trans_free_items(tp, flags);
+ xfs_trans_free_busy(tp);
+ xfs_trans_free(tp);
+}
+
+/*
+ * Format the transaction direct to the iclog. This isolates the physical
+ * transaction commit operation from the logical operation and hence allows
+ * other methods to be introduced without affecting the existing commit path.
+ */
+static int
+xfs_trans_commit_iclog(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ int shutdown;
+ int error;
+ int log_flags = 0;
+ struct xlog_in_core *commit_iclog;
+#define XFS_TRANS_LOGVEC_COUNT 16
+ struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
+ struct xfs_log_iovec *log_vector;
+ uint nvec;
+
/*
* Ask each log item how many log_vector entries it will
@@ -861,8 +1047,7 @@ shut_us_down:
*/
nvec = xfs_trans_count_vecs(tp);
if (nvec == 0) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- goto shut_us_down;
+ return ENOMEM; /* triggers a shutdown! */
} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
log_vector = log_vector_fast;
} else {
@@ -877,6 +1062,9 @@ shut_us_down:
*/
xfs_trans_fill_vecs(tp, log_vector);
+ if (flags & XFS_TRANS_RELEASE_LOG_RES)
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+
error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
/*
@@ -884,18 +1072,17 @@ shut_us_down:
* at any time after this call. However, all the items associated
* with the transaction are still locked and pinned in memory.
*/
- commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+ *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
- tp->t_commit_lsn = commit_lsn;
- if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+ tp->t_commit_lsn = *commit_lsn;
+ if (nvec > XFS_TRANS_LOGVEC_COUNT)
kmem_free(log_vector);
- }
/*
* If we got a log write error. Unpin the logitems that we
* had pinned, clean up, free trans structure, and return error.
*/
- if (error || commit_lsn == -1) {
+ if (error || *commit_lsn == -1) {
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
return XFS_ERROR(EIO);
@@ -909,8 +1096,6 @@ shut_us_down:
*/
xfs_trans_unreserve_and_mod_sb(tp);
- sync = tp->t_flags & XFS_TRANS_SYNC;
-
/*
* Tell the LM to call the transaction completion routine
* when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1138,7 @@ shut_us_down:
* the commit lsn of this transaction for dependency tracking
* purposes.
*/
- xfs_trans_unlock_items(tp, commit_lsn);
+ xfs_trans_unlock_items(tp, *commit_lsn);
/*
* If we detected a log error earlier, finish committing
@@ -973,156 +1158,114 @@ shut_us_down:
* and the items are released we can finally allow the iclog to
* go to disk.
*/
- error = xfs_log_release_iclog(mp, commit_iclog);
-
- /*
- * If the transaction needs to be synchronous, then force the
- * log out now and wait for it.
- */
- if (sync) {
- if (!error) {
- error = _xfs_log_force_lsn(mp, commit_lsn,
- XFS_LOG_SYNC, log_flushed);
- }
- XFS_STATS_INC(xs_trans_sync);
- } else {
- XFS_STATS_INC(xs_trans_async);
- }
-
- return (error);
+ return xfs_log_release_iclog(mp, commit_iclog);
}
/*
- * Total up the number of log iovecs needed to commit this
- * transaction. The transaction itself needs one for the
- * transaction header. Ask each dirty item in turn how many
- * it needs to get the total.
+ * xfs_trans_commit
+ *
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
*/
-STATIC uint
-xfs_trans_count_vecs(
- xfs_trans_t *tp)
+int
+_xfs_trans_commit(
+ struct xfs_trans *tp,
+ uint flags,
+ int *log_flushed)
{
- int nvecs;
- xfs_log_item_desc_t *lidp;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_lsn_t commit_lsn = -1;
+ int error = 0;
+ int log_flags = 0;
+ int sync = tp->t_flags & XFS_TRANS_SYNC;
- nvecs = 1;
- lidp = xfs_trans_first_item(tp);
- ASSERT(lidp != NULL);
-
- /* In the non-debug case we need to start bailing out if we
- * didn't find a log_item here, return zero and let trans_commit
- * deal with it.
+ /*
+ * Determine whether this commit is releasing a permanent
+ * log reservation or not.
*/
- if (lidp == NULL)
- return 0;
-
- while (lidp != NULL) {
- /*
- * Skip items which aren't dirty in this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
- lidp = xfs_trans_next_item(tp, lidp);
- continue;
- }
- lidp->lid_size = IOP_SIZE(lidp->lid_item);
- nvecs += lidp->lid_size;
- lidp = xfs_trans_next_item(tp, lidp);
+ if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ log_flags = XFS_LOG_REL_PERM_RESERV;
}
- return nvecs;
-}
-
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
- xfs_trans_t *tp,
- uint flags)
-{
- xfs_log_item_desc_t *lidp;
+ /*
+ * If there is nothing to be logged by the transaction,
+ * then unlock all of the items associated with the
+ * transaction and free the transaction structure.
+ * Also make sure to return any reserved blocks to
+ * the free pool.
+ */
+ if (!(tp->t_flags & XFS_TRANS_DIRTY))
+ goto out_unreserve;
- for (lidp = xfs_trans_first_item(tp);
- lidp != NULL;
- lidp = xfs_trans_next_item(tp, lidp)) {
- /*
- * Unpin all but those that aren't dirty.
- */
- if (lidp->lid_flags & XFS_LID_DIRTY)
- IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ goto out_unreserve;
}
- xfs_trans_unreserve_and_mod_sb(tp);
- xfs_trans_unreserve_and_mod_dquots(tp);
+ ASSERT(tp->t_ticket != NULL);
- xfs_trans_free_items(tp, flags);
- xfs_trans_free_busy(tp);
- xfs_trans_free(tp);
-}
+ /*
+ * If we need to update the superblock, then do it now.
+ */
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+ xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
-/*
- * Fill in the vector with pointers to data to be logged
- * by this transaction. The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
- *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
- */
-STATIC void
-xfs_trans_fill_vecs(
- xfs_trans_t *tp,
- xfs_log_iovec_t *log_vector)
-{
- xfs_log_item_desc_t *lidp;
- xfs_log_iovec_t *vecp;
- uint nitems;
+ error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+ if (error == ENOMEM) {
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ error = XFS_ERROR(EIO);
+ goto out_unreserve;
+ }
/*
- * Skip over the entry for the transaction header, we'll
- * fill that in at the end.
+ * If the transaction needs to be synchronous, then force the
+ * log out now and wait for it.
*/
- vecp = log_vector + 1; /* pointer arithmetic */
-
- nitems = 0;
- lidp = xfs_trans_first_item(tp);
- ASSERT(lidp != NULL);
- while (lidp != NULL) {
- /*
- * Skip items which aren't dirty in this transaction.
- */
- if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
- lidp = xfs_trans_next_item(tp, lidp);
- continue;
- }
- /*
- * The item may be marked dirty but not log anything.
- * This can be used to get called when a transaction
- * is committed.
- */
- if (lidp->lid_size) {
- nitems++;
+ if (sync) {
+ if (!error) {
+ error = _xfs_log_force_lsn(mp, commit_lsn,
+ XFS_LOG_SYNC, log_flushed);
}
- IOP_FORMAT(lidp->lid_item, vecp);
- vecp += lidp->lid_size; /* pointer arithmetic */
- IOP_PIN(lidp->lid_item);
- lidp = xfs_trans_next_item(tp, lidp);
+ XFS_STATS_INC(xs_trans_sync);
+ } else {
+ XFS_STATS_INC(xs_trans_async);
}
+ return error;
+
+out_unreserve:
+ xfs_trans_unreserve_and_mod_sb(tp);
+
/*
- * Now that we've counted the number of items in this
- * transaction, fill in the transaction header.
+ * It is indeed possible for the transaction to be not dirty but
+ * the dqinfo portion to be. All that means is that we have some
+ * (non-persistent) quota reservations that need to be unreserved.
*/
- tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
- tp->t_header.th_type = tp->t_type;
- tp->t_header.th_num_items = nitems;
- log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
- log_vector->i_len = sizeof(xfs_trans_header_t);
- log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
+ xfs_trans_unreserve_and_mod_dquots(tp);
+ if (tp->t_ticket) {
+ commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ if (commit_lsn == -1 && !error)
+ error = XFS_ERROR(EIO);
+ }
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
+ xfs_trans_free_busy(tp);
+ xfs_trans_free(tp);
+ XFS_STATS_INC(xs_trans_empty);
+ return error;
+}
/*
* Unlock all of the transaction's items and free the transaction.
@@ -1200,20 +1343,6 @@ xfs_trans_cancel(
xfs_trans_free(tp);
}
-
-/*
- * Free the transaction structure. If there is more clean up
- * to do when the structure is freed, add it here.
- */
-STATIC void
-xfs_trans_free(
- xfs_trans_t *tp)
-{
- atomic_dec(&tp->t_mountp->m_active_trans);
- xfs_trans_free_dqinfo(tp);
- kmem_zone_free(xfs_trans_zone, tp);
-}
-
/*
* Roll from one trans in the sequence of PERMANENT transactions to
* the next: permanent transactions are only flushed out when
@@ -1283,174 +1412,3 @@ xfs_trans_roll(
xfs_trans_ihold(trans, dp);
return 0;
}
-
-/*
- * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
- *
- * This is typically called by the LM when a transaction has been fully
- * committed to disk. It needs to unpin the items which have
- * been logged by the transaction and update their positions
- * in the AIL if necessary.
- * This also gets called when the transactions didn't get written out
- * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
- *
- * Call xfs_trans_chunk_committed() to process the items in
- * each chunk.
- */
-STATIC void
-xfs_trans_committed(
- xfs_trans_t *tp,
- int abortflag)
-{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_chunk_t *next_licp;
- xfs_log_busy_chunk_t *lbcp;
- xfs_log_busy_slot_t *lbsp;
- int i;
-
- /*
- * Call the transaction's completion callback if there
- * is one.
- */
- if (tp->t_callback != NULL) {
- tp->t_callback(tp, tp->t_callarg);
- }
-
- /*
- * Special case the chunk embedded in the transaction.
- */
- licp = &(tp->t_items);
- if (!(xfs_lic_are_all_free(licp))) {
- xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
- }
-
- /*
- * Process the items in each chunk in turn.
- */
- licp = licp->lic_next;
- while (licp != NULL) {
- ASSERT(!xfs_lic_are_all_free(licp));
- xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
- next_licp = licp->lic_next;
- kmem_free(licp);
- licp = next_licp;
- }
-
- /*
- * Clear all the per-AG busy list items listed in this transaction
- */
- lbcp = &tp->t_busy;
- while (lbcp != NULL) {
- for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
- if (!XFS_LBC_ISFREE(lbcp, i)) {
- xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
- lbsp->lbc_idx);
- }
- }
- lbcp = lbcp->lbc_next;
- }
- xfs_trans_free_busy(tp);
-
- /*
- * That's it for the transaction structure. Free it.
- */
- xfs_trans_free(tp);
-}
-
-/*
- * This is called to perform the commit processing for each
- * item described by the given chunk.
- *
- * The commit processing consists of unlocking items which were
- * held locked with the SYNC_UNLOCK attribute, calling the committed
- * routine of each logged item, updating the item's position in the AIL
- * if necessary, and unpinning each item. If the committed routine
- * returns -1, then do nothing further with the item because it
- * may have been freed.
- *
- * Since items are unlocked when they are copied to the incore
- * log, it is possible for two transactions to be completing
- * and manipulating the same item simultaneously. The AIL lock
- * will protect the lsn field of each item. The value of this
- * field can never go backwards.
- *
- * We unpin the items after repositioning them in the AIL, because
- * otherwise they could be immediately flushed and we'd have to race
- * with the flusher trying to pull the item from the AIL as we add it.
- */
-STATIC void
-xfs_trans_chunk_committed(
- xfs_log_item_chunk_t *licp,
- xfs_lsn_t lsn,
- int aborted)
-{
- xfs_log_item_desc_t *lidp;
- xfs_log_item_t *lip;
- xfs_lsn_t item_lsn;
- int i;
-
- lidp = licp->lic_descs;
- for (i = 0; i < licp->lic_unused; i++, lidp++) {
- struct xfs_ail *ailp;
-
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- lip = lidp->lid_item;
- if (aborted)
- lip->li_flags |= XFS_LI_ABORTED;
-
- /*
- * Send in the ABORTED flag to the COMMITTED routine
- * so that it knows whether the transaction was aborted
- * or not.
- */
- item_lsn = IOP_COMMITTED(lip, lsn);
-
- /*
- * If the committed routine returns -1, make
- * no more references to the item.
- */
- if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
- continue;
- }
-
- /*
- * If the returned lsn is greater than what it
- * contained before, update the location of the
- * item in the AIL. If it is not, then do nothing.
- * Items can never move backwards in the AIL.
- *
- * While the new lsn should usually be greater, it
- * is possible that a later transaction completing
- * simultaneously with an earlier one using the
- * same item could complete first with a higher lsn.
- * This would cause the earlier transaction to fail
- * the test below.
- */
- ailp = lip->li_ailp;
- spin_lock(&ailp->xa_lock);
- if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
- /*
- * This will set the item's lsn to item_lsn
- * and update the position of the item in
- * the AIL.
- *
- * xfs_trans_ail_update() drops the AIL lock.
- */
- xfs_trans_ail_update(ailp, lip, item_lsn);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
-
- /*
- * Now that we've repositioned the item in the AIL,
- * unpin it so it can be flushed. Pass information
- * about buffer stale state down from the log item
- * flags, if anyone else stales the buffer we do not
- * want to pay any attention to it.
- */
- IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
- }
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab..c62beee 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
#define XFS_LI_DQUOT 0x123d
#define XFS_LI_QUOTAOFF 0x123e
+#define XFS_LI_TYPE_DESC \
+ { XFS_LI_EFI, "XFS_LI_EFI" }, \
+ { XFS_LI_EFD, "XFS_LI_EFD" }, \
+ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
+ { XFS_LI_INODE, "XFS_LI_INODE" }, \
+ { XFS_LI_BUF, "XFS_LI_BUF" }, \
+ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
+ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
+
/*
* Transaction types. Used to distinguish types of buffers.
*/
@@ -159,7 +168,6 @@ typedef struct xfs_log_item_desc {
#define XFS_LID_DIRTY 0x1
#define XFS_LID_PINNED 0x2
-#define XFS_LID_BUF_STALE 0x8
/*
* This structure is used to maintain a chunk list of log_item_desc
@@ -833,7 +841,7 @@ typedef struct xfs_item_ops {
uint (*iop_size)(xfs_log_item_t *);
void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
void (*iop_pin)(xfs_log_item_t *);
- void (*iop_unpin)(xfs_log_item_t *, int);
+ void (*iop_unpin)(xfs_log_item_t *);
void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
uint (*iop_trylock)(xfs_log_item_t *);
void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +854,7 @@ typedef struct xfs_item_ops {
#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip)
#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb58636..9cd8090 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
#include "xfs_rw.h"
#include "xfs_trace.h"
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.
+ */
+STATIC struct xfs_buf *
+xfs_trans_buf_item_match(
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ int len)
+{
+ xfs_log_item_chunk_t *licp;
+ xfs_log_item_desc_t *lidp;
+ xfs_buf_log_item_t *blip;
+ int i;
-STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
- xfs_daddr_t, int);
-STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
- xfs_daddr_t, int);
+ len = BBTOB(len);
+ for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
+ if (xfs_lic_are_all_free(licp)) {
+ ASSERT(licp == &tp->t_items);
+ ASSERT(licp->lic_next == NULL);
+ return NULL;
+ }
+
+ for (i = 0; i < licp->lic_unused; i++) {
+ /*
+ * Skip unoccupied slots.
+ */
+ if (xfs_lic_isfree(licp, i))
+ continue;
+
+ lidp = xfs_lic_slot(licp, i);
+ blip = (xfs_buf_log_item_t *)lidp->lid_item;
+ if (blip->bli_item.li_type != XFS_LI_BUF)
+ continue;
+
+ if (XFS_BUF_TARGET(blip->bli_buf) == target &&
+ XFS_BUF_ADDR(blip->bli_buf) == blkno &&
+ XFS_BUF_COUNT(blip->bli_buf) == len)
+ return blip->bli_buf;
+ }
+ }
+
+ return NULL;
+}
/*
* Add the locked buffer to the transaction.
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
* within the transaction, just increment its lock recursion count
* and return a pointer to it.
*
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use get_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
* If the transaction pointer is NULL, make this just a normal
* get_buf() call.
*/
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
* have it locked. In this case we just increment the lock
* recursion count and return the buffer to the caller.
*/
- if (tp->t_items.lic_next == NULL) {
- bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
- } else {
- bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
- }
+ bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
if (bp != NULL) {
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
* within the transaction and already read in, just increment its
* lock recursion count and return a pointer to it.
*
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
* If the transaction pointer is NULL, make this just a normal
* read_buf() call.
*/
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
* If the buffer is not yet read in, then we read it in, increment
* the lock recursion count, and return it to the caller.
*/
- if (tp->t_items.lic_next == NULL) {
- bp = xfs_trans_buf_item_match(tp, target, blkno, len);
- } else {
- bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
- }
+ bp = xfs_trans_buf_item_match(tp, target, blkno, len);
if (bp != NULL) {
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY;
lidp->lid_flags |= XFS_LID_DIRTY;
- lidp->lid_flags &= ~XFS_LID_BUF_STALE;
bip->bli_flags |= XFS_BLI_LOGGED;
xfs_buf_item_log(bip, first, last);
}
@@ -782,7 +797,7 @@ xfs_trans_binval(
bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
memset((char *)(bip->bli_format.blf_data_map), 0,
(bip->bli_format.blf_map_size * sizeof(uint)));
- lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+ lidp->lid_flags |= XFS_LID_DIRTY;
tp->t_flags |= XFS_TRANS_DIRTY;
}
@@ -902,111 +917,3 @@ xfs_trans_dquot_buf(
bip->bli_format.blf_flags |= type;
}
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction. Only check the first, embedded
- * chunk, since we don't want to spend all day scanning large transactions.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match(
- xfs_trans_t *tp,
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- int len)
-{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_desc_t *lidp;
- xfs_buf_log_item_t *blip;
- xfs_buf_t *bp;
- int i;
-
- bp = NULL;
- len = BBTOB(len);
- licp = &tp->t_items;
- if (!xfs_lic_are_all_free(licp)) {
- for (i = 0; i < licp->lic_unused; i++) {
- /*
- * Skip unoccupied slots.
- */
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- lidp = xfs_lic_slot(licp, i);
- blip = (xfs_buf_log_item_t *)lidp->lid_item;
- if (blip->bli_item.li_type != XFS_LI_BUF) {
- continue;
- }
-
- bp = blip->bli_buf;
- if ((XFS_BUF_TARGET(bp) == target) &&
- (XFS_BUF_ADDR(bp) == blkno) &&
- (XFS_BUF_COUNT(bp) == len)) {
- /*
- * We found it. Break out and
- * return the pointer to the buffer.
- */
- break;
- } else {
- bp = NULL;
- }
- }
- }
- return bp;
-}
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction. Check all the chunks, we
- * want to be thorough.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match_all(
- xfs_trans_t *tp,
- xfs_buftarg_t *target,
- xfs_daddr_t blkno,
- int len)
-{
- xfs_log_item_chunk_t *licp;
- xfs_log_item_desc_t *lidp;
- xfs_buf_log_item_t *blip;
- xfs_buf_t *bp;
- int i;
-
- bp = NULL;
- len = BBTOB(len);
- for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
- if (xfs_lic_are_all_free(licp)) {
- ASSERT(licp == &tp->t_items);
- ASSERT(licp->lic_next == NULL);
- return NULL;
- }
- for (i = 0; i < licp->lic_unused; i++) {
- /*
- * Skip unoccupied slots.
- */
- if (xfs_lic_isfree(licp, i)) {
- continue;
- }
-
- lidp = xfs_lic_slot(licp, i);
- blip = (xfs_buf_log_item_t *)lidp->lid_item;
- if (blip->bli_item.li_type != XFS_LI_BUF) {
- continue;
- }
-
- bp = blip->bli_buf;
- if ((XFS_BUF_TARGET(bp) == target) &&
- (XFS_BUF_ADDR(bp) == blkno) &&
- (XFS_BUF_COUNT(bp) == len)) {
- /*
- * We found it. Break out and
- * return the pointer to the buffer.
- */
- return bp;
- }
- }
- }
- return NULL;
-}